Trie 与补全、纠错

Table of Contents


介绍

Trie 树也就是所谓的字典树,一般用来做字符串前缀匹配,词频统计。如果数据量比较大,而且数据重复率比较高(或者说前缀重复率比较高),用 trie 做词频统计可以节省很多空间。如果对空间没要求,hash 算法还是要好于用 trie。另一个用处是可以对字符串进行字典序排序(深度优先搜索)。 这个用 hash 也是不能做的。Trie 的插入和查询时间复杂度都为 O(k) , 其中 k 为 key 的长度,与 Trie 中保存了多少个元素无关。

应用

浏览器自动补全

这个是 google 的自动补全功能,当然 baidu 的也有。

GoogleSuggest.png

Figure 1: Google AutoComplete

拼写检查

例如 word 里面的

SpellCheck.png

Figure 2: Word

T9 predictive text

208_T9.jpg

Figure 4: T9 which stands for Text on 9 keys, was used on phones to input texts during the late 1990s.

Prefix Trie

先看实现代码

定义 Trie 结构:

#include <iostream>
#define MAX 26
using namespace std;

typedef struct TrieNode {
    bool isStr; // 字符串是否结束
    int prefixCount; // 以该字符结束的前缀有多少个
    struct TrieNode *next[MAX];
} Trie;

插入我们可以这样来:

void insert(Trie *root, const char *s) {
    if (!root || *s == '\0')
        return;
    int i;
    Trie *p = root;

    while(*s!='\0') {
        if (!p->next[*s-'a']) {
            Trie *temp = new Trie();
            for (i = 0;i < MAX; i ++){
                temp->next[i] = NULL;
            }
            temp->isStr = false;
            temp->prefixCount=1;
            p->next[*s-'a'] = temp;
            p = p->next[*s-'a'];
        } else {
            p = p->next[*s - 'a'];
            p->prefixCount += 1;
        }
        s ++;
    }
    p->isStr = true;
}

查询:

int search(Trie *root, const char *s) {
    Trie *p = root;
    while(p != NULL && *s != '\0') {
        if (p->next[*s-'a']) {
            p = p->next[*s - 'a'];
            s ++;
        }
    }
    return (p != NULL && p->isStr == true);
}

查找前缀出现次数:

int countPrefix(Trie *root, char *s) {
    if (!root || *s == '\0')
        return 0;
    Trie *p = root;
    while(p != NULL && *s != '\0') {
        if (p->next[*s - 'a']) {
            p = p->next[*s - 'a'];
            s ++;
        } else {
            return 0;
        }
    }
    if(p == NULL)
        return 0;
    else
        return p->prefixCount;
}

自动补全功能和纠错

核心代码如下, 比较简单,不解释了。完整代码(这里)。

void FindFullStr(TreeNode *root, string query, vector<string> &result) {
    TreeNode *p = root;
    if (root == NULL)
        return;
    if (root->isEnd)
        result.push_back(query);
    for (int i = 0;i < N; i ++) {
        if (root->next[i] != NULL) {
            char c = (char)i;
            FindFullStr(root->next[i], query+c, result);
        }
    }
}

void FindStr(string src, vector<string> &result) {
    int cur = 0;
    TreeNode *p = this->root;
    unsigned char index = (unsigned char) src[cur];

    while (cur < src.size() && p->next[index]) {
        p = p->next[index];
        ++cur;
        index = (unsigned char) src[cur];
    }
    if (cur != src.size() || p == NULL) {
        return;
    }
    FindFullStr(p, src, result);
}

    int IsError(string query) {
    TreeNode *p = this->root;
    if (p == NULL)
        return 0;
    int cur = 0;
    unsigned char index = (unsigned char) query[cur];
    while(cur < query.size() && p->next[index]) {
        p = p->next[index];
        ++ cur;
        index = (unsigned char) query[cur];
    }
    if (!p->isEnd)
        return cur;

    return 0;
}

void CorrectSpell(string query, vector<string> &res) {
    int index = IsError(query);
    if (index) {
        string sub = query.substr(0, index);
        FindStr(sub, res);
    }
}


    int main(int argc, char *argv[])
{
    Trie *p = new Trie();
    p->InsertNode("hello world");
    p->InsertNode("hello baby");
    p->InsertNode("hello emm..");
    p->InsertNode("hello");
    p->InsertNode("你好啊");
    p->InsertNode("你真好");
    p->InsertNode("你好坏");

    string query;
    cout << "Please input query" << endl;
    cin >> query;

    vector<string> result;
    //p->FindStr(query, result);

    p->CorrectSpell(query, result);

    vector<string>::iterator it;
    if (result.size() > 0)
        cout << "You may want input:" << endl;
    else
        cout << "You input is correct" << endl;
    for(it = result.begin(); it != result.end(); ++ it){
        cout << *it << endl;
    }
    return 0;
}

实验效果如下:

20180604_122203_1118wlQ.png

20180604_122249_1118K6c.png

扩展:Double Array Trie

Reference