(learn&think)

不浮躁,不自傲,学习,思考,总结

Programming Pearls: Column14-15

| Comments

Book notes

Hash 存储 word 次数

typedef struct Node* Nodeptr;
struct Node {
  Node(string inword, int incount, Nodeptr innext) {
    word = inword;
    count = incount;
    next = innext;
  }
  string word;
  int count;
  Nodeptr next;
};

#define NHASH 29989
#define MULT 31
Nodeptr bin[NHASH];

unsigned int Hash(const string &str) {
  unsigned int h = 0;
  for (string::const_iterator it = str.begin(); it != str.end(); ++it) {
    h = MULT * h + *it;
  }
  return h % NHASH;
}

void InWord(const string &str) {
  Nodeptr p;
  int h;
  h = Hash(str);
  for (p = bin[h]; p != NULL; p = p->next) {
    if (str.compare(p->word) == 0) {
      (p->count)++;
      return;
    }
  }
  p = new Node(str, 1, bin[h]);
  bin[h] = p;
}

int main(int argc, char *argv[]) {
  string str;
  int i;
  for (i = 0; i < NHASH; ++i) {
    bin[i] = NULL;
  }
  while (cin >> str) {
    InWord(str);
  }
  for (i = 0; i < NHASH; ++i) {
    for (Nodeptr p = bin[i]; p != NULL; p = p->next) {
      cout << p->word << " " << p->count << endl;
    }
  }
  return 0;
}

Markov 产生随机词汇

利用指针指向不同单词的开头,并按照 K 个单词对比方式排序,利用二分搜索定位相同 K 长度的文本,并利用Reservoir sampling在不知道长度的情况下,均等的随机选取一个。

#define MAXINPUT 4000000
#define MAXWORDS 800000
#define K 2
char input_letters[MAXINPUT];
char *word[MAXWORDS];

int WordNcmp(const char *p, const char *q, int n) {
  while (*p == *q) {
    if (*p == 0 && --n == 0) {
      return 0;
    }
    ++p;
    ++q;
  }
  return *p - *q;
}


int SortCmp(const void *a, const void *b) {
  const char **p = (const char**)(a);
  const char **q = (const char**)(b);
  return WordNcmp(*p, *q, K);
}

char* SkipNword(char *p, int n) {
  for (; n > 0; p++) {
    if (*p == 0) {
      --n;
    }
  }
  return p;
}

int FindPhrase(char **word, int n, char *phrase) {
  int l = -1;
  int u = n;
  int m;
  while (l + 1 != u) {
    m = (l + u) / 2;
    if (WordNcmp(word[m], phrase, K) < 0) {
      l = m;
    } else {
      u = m;
    }
  }
  return u;
}

int main(int argc, char *argv[]) {
  int nword = 0;
  word[0] = input_letters;
  while (scanf("%s", word[nword]) != EOF) {
    word[nword + 1] = word[nword] + strlen(word[nword]) + 1;
    nword++;
    if (nword == MAXWORDS) {
      break;
    }
  }
  int i;
  for (i = 0; i < K; ++i) {
    word[nword][i] = 0;
  }
  for (i = 0; i < K; ++i) {
    printf("%s ", word[i]);
  }
  qsort(word, nword, sizeof(word[0]), SortCmp);
  char *phrase = input_letters;
  int printlen = 100;
  int find_index;
  char *p;
  for (; printlen > 0; --printlen) {
    int find_index = FindPhrase(word, nword, phrase);
    for (i = 0; WordNcmp(phrase, word[find_index + i], K) == 0; ++i) {
      if ((rand() % (i + 1)) == 0) {
        p = word[find_index + i];
      }
    }
    phrase = SkipNword(p, 1);
    if (strlen(SkipNword(phrase, K - 1)) == 0) {
      break;
    }
    printf("%s ", SkipNword(phrase, K - 1));
  }
  printf("\n");
  return 0;
}

Markov 利用 Hash 产生随机词汇

利用 Hash 表加快搜索相同 K 长度的文本。

#define MAXINPUT 4000000
#define MAXWORDS 800000
#define K 2
char input_letters[MAXINPUT];
char *word[MAXWORDS];

int WordNcmp(const char *p, const char *q, int n) {
  while (*p == *q) {
    if (*p == 0 && --n == 0) {
      return 0;
    }
    ++p;
    ++q;
  }
  return *p - *q;
}

char* SkipNword(char *p, int n) {
  for (; n > 0; p++) {
    if (*p == 0) {
      --n;
    }
  }
  return p;
}

#define NHASH 499979
#define MULT 31
int bin[NHASH];
int next[MAXWORDS];

unsigned int Hash(char *str) {
  unsigned int h = 0;
  char *p = str;
  for (int n = K; n > 0; p++) {
    h = MULT * h + (unsigned char)(*p);
    if (*p == 0) {
      --n;
    }
  }
  return h % NHASH;
}

void InitHash(char **word, int nword) {
  int i;
  for (i = 0; i < NHASH; ++i) {
    bin[i] = - 1;
  }
  for (i = 0; i < nword; ++i) {
    unsigned int h = Hash(word[i]);
    next[i] = bin[h];
    bin[h] = i;
  }
}

int main(int argc, char *argv[]) {
  int nword = 0;
  word[0] = input_letters;
  while (scanf("%s", word[nword]) != EOF) {
    word[nword + 1] = word[nword] + strlen(word[nword]) + 1;
    nword++;
    if (nword == MAXWORDS) {
      break;
    }
  }
  int i;
  for (i = 0; i < K; ++i) {
    word[nword][i] = 0;
  }
  InitHash(word, nword);
  for (i = 0; i < K; ++i) {
    printf("%s ", word[i]);
  }
  char *phrase = input_letters;
  int printlen = 100;
  char *p;
  for (; printlen > 0; --printlen) {
    i = 0;
    for (int j = bin[Hash(phrase)]; j >= 0; j = next[j]) {
      if (WordNcmp(word[j], phrase, K) == 0 && (rand() % (++i) == 0)) {
        p = word[j];
      }
    }
    phrase = SkipNword(p, 1);
    if (strlen(SkipNword(phrase, K - 1)) == 0) {
      break;
    }
    printf("%s ", SkipNword(phrase, K - 1));
  }
  printf("\n");
  return 0;
}

Problems

Column14-2

void SiftDown(int *x, int l, int u) {
  int i = l;
  int child;
  for (;;) {
    child = i * 2;
    if (child > u) {
      break;
    }
    if (child + 1 <= u) {
      if (x[child + 1] < x[child]) {
        child++;
      }
    }
    if (x[i] <= x[child]) {
      break;
    }
    swap(x[i], x[child]);
    i = child;
  }
}

void HeapSort(int *x, int n) {
  int i;
  for (i = n / 2; i >= 1; --i) {
    SiftDown(x, i, n);
  }
  for (i = n; i >= 2; --i) {
    swap(x[1], x[i]);
    SiftDown(x, 1, i - 1);
  }
}

Column15-8

找出最长重复超过 M 次的字符串。

经过排序后,越是相邻的越是相同的多,至少重复 M 次,就是计算相邻 M 个位置的字符所重复的字符长度,即 ComLen(pstr[i], pstr[i + kM])

int CmpPstr(const void *a, const void *b) {
  const char **p = (const char **)a;
  const char **q = (const char **)b;
  return strcmp(*p, *q);
}

int ComLen(char *p, char *q) {
  int i = 0;
  while (*p && (*p == *q)) {
    ++i;
    ++p;
    ++q;
  }
  return i;
}

#define kMaxN 500000
#define kM 1
char str[kMaxN];
char *pstr[kMaxN];
int main(int argc, char *argv[]) {
  int ch;
  int n = 0;
  while ((ch = getchar()) != EOF) {
    str[n] = ch;
    pstr[n] = &str[n];
    ++n;
  }
  str[n] = 0;
  qsort(pstr, n, sizeof(char *), CmpPstr);
  int maxlen = 0;
  int maxindex = 0;
  for (int i = 0; i < n - kM; ++i) {
    if (ComLen(pstr[i], pstr[i + kM]) > maxlen) {
      maxlen = ComLen(pstr[i], pstr[i + kM]);
      maxindex = i;
    }
  }
  printf("%.*s\n", maxlen, pstr[maxindex]);
  return 0;
}

Column15-9

找出两个文本中最长的共同字符串。

经典Longest common substring problem. 利用 Dynamic Programming 解决。复杂度 O(mn).

vector<string> LongestCommonString(const string &s, const string &t) {
  vector<vector<int> > array;
  int len_s = s.size();
  int len_t = t.size();
  int i, j;
  array.resize(len_s);
  for (i = 0; i < len_s; ++i) {
    array[i].resize(len_t);
  }
  int max_len = 0;
  vector<int> end_indexs;
  for (i = 0; i < len_s; ++i) {
    for (j = 0; j < len_t; ++j) {
      if (s[i] == t[j]) {
        if (i == 0 || j == 0) {
          array[i][j] = 1;
        } else {
          array[i][j] = array[i-1][j-1] + 1;
        }
        if (array[i][j] == max_len) {
          end_indexs.push_back(i);
        } else if (array[i][j] > max_len) {
          max_len = array[i][j];
          end_indexs.clear();
          end_indexs.push_back(i);
        }
      }
    }
  }
  vector<string> res;
  for (vector<int>::iterator it = end_indexs.begin(); it != end_indexs.end();
       ++it) {
    res.push_back(s.substr(*it - max_len + 1, max_len));
  }
  return res;
}

Column15-11

产生单词层次的 Markov 文本。

int main(int argc, char *argv[]) {
  const int kMax = 50000;
  const int kK = 5;
  const int kPrintlen = 1000;
  char str[kMax];
  int c, n;
  n = 0;
  while ((c = getchar()) != EOF) {
    str[n++] = c;
  }
  str[n] = 0;
  char *p, *q, *next_p;
  p = str;
  int i, eq_sofar, j;
  for (i = 0; i < kK; ++i) {
    printf("%c", str[i]);
  }
  for (i = 0; i < kPrintlen; ++i) {
    eq_sofar = 0;
    for (q = str; q < str + n - kK + 1; ++q) {
      for (j = 0; j < kK && *(p + j) == *(q + j); ++j) {
      }
      if (j == kK) {
        eq_sofar++;
        if (rand() % eq_sofar == 0) {
          next_p = q;
        }
      }
    }
    c = *(next_p + kK);
    if (c == 0) {
      break;
    }
    putchar(c);
    p = next_p + 1;
  }
  return 0;
}

Comments