Book notes
Hash 存储 word 次数
typedef struct Node* Nodeptr; struct Node { Node(string inword, int incount, Nodeptr innext) { word = inword; count = incount; next = innext; } string word; int count; Nodeptr next; }; #define NHASH 29989 #define MULT 31 Nodeptr bin[NHASH]; unsigned int Hash(const string &str) { unsigned int h = 0; for (string::const_iterator it = str.begin(); it != str.end(); ++it) { h = MULT * h + *it; } return h % NHASH; } void InWord(const string &str) { Nodeptr p; int h; h = Hash(str); for (p = bin[h]; p != NULL; p = p->next) { if (str.compare(p->word) == 0) { (p->count)++; return; } } p = new Node(str, 1, bin[h]); bin[h] = p; } int main(int argc, char *argv[]) { string str; int i; for (i = 0; i < NHASH; ++i) { bin[i] = NULL; } while (cin >> str) { InWord(str); } for (i = 0; i < NHASH; ++i) { for (Nodeptr p = bin[i]; p != NULL; p = p->next) { cout << p->word << " " << p->count << endl; } } return 0; }
Markov 产生随机词汇
利用指针指向不同单词的开头,并按照 K 个单词对比方式排序,利用二分搜索定位相同 K 长度的文本,并利用Reservoir sampling在不知道长度的情况下,均等的随机选取一个。
#define MAXINPUT 4000000 #define MAXWORDS 800000 #define K 2 char input_letters[MAXINPUT]; char *word[MAXWORDS]; int WordNcmp(const char *p, const char *q, int n) { while (*p == *q) { if (*p == 0 && --n == 0) { return 0; } ++p; ++q; } return *p - *q; } int SortCmp(const void *a, const void *b) { const char **p = (const char**)(a); const char **q = (const char**)(b); return WordNcmp(*p, *q, K); } char* SkipNword(char *p, int n) { for (; n > 0; p++) { if (*p == 0) { --n; } } return p; } int FindPhrase(char **word, int n, char *phrase) { int l = -1; int u = n; int m; while (l + 1 != u) { m = (l + u) / 2; if (WordNcmp(word[m], phrase, K) < 0) { l = m; } else { u = m; } } return u; } int main(int argc, char *argv[]) { int nword = 0; word[0] = input_letters; while (scanf("%s", word[nword]) != EOF) { word[nword + 1] = word[nword] + strlen(word[nword]) + 1; nword++; if (nword == MAXWORDS) { break; } } int i; for (i = 0; i < K; ++i) { word[nword][i] = 0; } for (i = 0; i < K; ++i) { printf("%s ", word[i]); } qsort(word, nword, sizeof(word[0]), SortCmp); char *phrase = input_letters; int printlen = 100; int find_index; char *p; for (; printlen > 0; --printlen) { int find_index = FindPhrase(word, nword, phrase); for (i = 0; WordNcmp(phrase, word[find_index + i], K) == 0; ++i) { if ((rand() % (i + 1)) == 0) { p = word[find_index + i]; } } phrase = SkipNword(p, 1); if (strlen(SkipNword(phrase, K - 1)) == 0) { break; } printf("%s ", SkipNword(phrase, K - 1)); } printf("\n"); return 0; }
Markov 利用 Hash 产生随机词汇
利用 Hash 表加快搜索相同 K 长度的文本。
#define MAXINPUT 4000000 #define MAXWORDS 800000 #define K 2 char input_letters[MAXINPUT]; char *word[MAXWORDS]; int WordNcmp(const char *p, const char *q, int n) { while (*p == *q) { if (*p == 0 && --n == 0) { return 0; } ++p; ++q; } return *p - *q; } char* SkipNword(char *p, int n) { for (; n > 0; p++) { if (*p == 0) { --n; } } return p; } #define NHASH 499979 #define MULT 31 int bin[NHASH]; int next[MAXWORDS]; unsigned int Hash(char *str) { unsigned int h = 0; char *p = str; for (int n = K; n > 0; p++) { h = MULT * h + (unsigned char)(*p); if (*p == 0) { --n; } } return h % NHASH; } void InitHash(char **word, int nword) { int i; for (i = 0; i < NHASH; ++i) { bin[i] = - 1; } for (i = 0; i < nword; ++i) { unsigned int h = Hash(word[i]); next[i] = bin[h]; bin[h] = i; } } int main(int argc, char *argv[]) { int nword = 0; word[0] = input_letters; while (scanf("%s", word[nword]) != EOF) { word[nword + 1] = word[nword] + strlen(word[nword]) + 1; nword++; if (nword == MAXWORDS) { break; } } int i; for (i = 0; i < K; ++i) { word[nword][i] = 0; } InitHash(word, nword); for (i = 0; i < K; ++i) { printf("%s ", word[i]); } char *phrase = input_letters; int printlen = 100; char *p; for (; printlen > 0; --printlen) { i = 0; for (int j = bin[Hash(phrase)]; j >= 0; j = next[j]) { if (WordNcmp(word[j], phrase, K) == 0 && (rand() % (++i) == 0)) { p = word[j]; } } phrase = SkipNword(p, 1); if (strlen(SkipNword(phrase, K - 1)) == 0) { break; } printf("%s ", SkipNword(phrase, K - 1)); } printf("\n"); return 0; }
Problems
Column14-2
void SiftDown(int *x, int l, int u) { int i = l; int child; for (;;) { child = i * 2; if (child > u) { break; } if (child + 1 <= u) { if (x[child + 1] < x[child]) { child++; } } if (x[i] <= x[child]) { break; } swap(x[i], x[child]); i = child; } } void HeapSort(int *x, int n) { int i; for (i = n / 2; i >= 1; --i) { SiftDown(x, i, n); } for (i = n; i >= 2; --i) { swap(x[1], x[i]); SiftDown(x, 1, i - 1); } }
Column15-8
找出最长重复超过 M 次的字符串。
经过排序后,越是相邻的越是相同的多,至少重复 M 次,就是计算相邻 M 个位置的字符所重复的字符长度,即 ComLen(pstr[i], pstr[i + kM])
int CmpPstr(const void *a, const void *b) { const char **p = (const char **)a; const char **q = (const char **)b; return strcmp(*p, *q); } int ComLen(char *p, char *q) { int i = 0; while (*p && (*p == *q)) { ++i; ++p; ++q; } return i; } #define kMaxN 500000 #define kM 1 char str[kMaxN]; char *pstr[kMaxN]; int main(int argc, char *argv[]) { int ch; int n = 0; while ((ch = getchar()) != EOF) { str[n] = ch; pstr[n] = &str[n]; ++n; } str[n] = 0; qsort(pstr, n, sizeof(char *), CmpPstr); int maxlen = 0; int maxindex = 0; for (int i = 0; i < n - kM; ++i) { if (ComLen(pstr[i], pstr[i + kM]) > maxlen) { maxlen = ComLen(pstr[i], pstr[i + kM]); maxindex = i; } } printf("%.*s\n", maxlen, pstr[maxindex]); return 0; }
Column15-9
找出两个文本中最长的共同字符串。
经典Longest common substring problem. 利用 Dynamic Programming 解决。复杂度 O(mn).
vector<string> LongestCommonString(const string &s, const string &t) { vector<vector<int> > array; int len_s = s.size(); int len_t = t.size(); int i, j; array.resize(len_s); for (i = 0; i < len_s; ++i) { array[i].resize(len_t); } int max_len = 0; vector<int> end_indexs; for (i = 0; i < len_s; ++i) { for (j = 0; j < len_t; ++j) { if (s[i] == t[j]) { if (i == 0 || j == 0) { array[i][j] = 1; } else { array[i][j] = array[i-1][j-1] + 1; } if (array[i][j] == max_len) { end_indexs.push_back(i); } else if (array[i][j] > max_len) { max_len = array[i][j]; end_indexs.clear(); end_indexs.push_back(i); } } } } vector<string> res; for (vector<int>::iterator it = end_indexs.begin(); it != end_indexs.end(); ++it) { res.push_back(s.substr(*it - max_len + 1, max_len)); } return res; }
Column15-11
产生单词层次的 Markov 文本。
int main(int argc, char *argv[]) { const int kMax = 50000; const int kK = 5; const int kPrintlen = 1000; char str[kMax]; int c, n; n = 0; while ((c = getchar()) != EOF) { str[n++] = c; } str[n] = 0; char *p, *q, *next_p; p = str; int i, eq_sofar, j; for (i = 0; i < kK; ++i) { printf("%c", str[i]); } for (i = 0; i < kPrintlen; ++i) { eq_sofar = 0; for (q = str; q < str + n - kK + 1; ++q) { for (j = 0; j < kK && *(p + j) == *(q + j); ++j) { } if (j == kK) { eq_sofar++; if (rand() % eq_sofar == 0) { next_p = q; } } } c = *(next_p + kK); if (c == 0) { break; } putchar(c); p = next_p + 1; } return 0; }