find_opt_references.cc 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Author: zip753@gmail.com (Ivan Nikulin)
  3. Distributed under MIT license.
  4. See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
  5. */
  6. /* Tool for generating optimal backward references for the input file. Uses
  7. sais-lite library for building suffix array. */
  8. #include <algorithm>
  9. #include <cassert>
  10. #include <cstdio>
  11. #include <cstring>
  12. #include <functional>
  13. #include <utility>
  14. #include <vector>
  15. #include <gflags/gflags.h>
  16. using gflags::ParseCommandLineFlags;
  17. #include "third_party/absl/flags/flag.h"
  18. #include <sais.hxx>
  19. DEFINE_bool(advanced, false, "Advanced searching mode: finds all longest "
  20. "matches at positions that are not covered by matches of length at least "
  21. "max_length. WARNING: uses much more memory than simple mode, especially "
  22. "for small values of min_length.");
  23. DEFINE_int32(min_length, 1, "Minimal length of found backward references.");
  24. /* For advanced mode. */
  25. DEFINE_int32(long_length, 32,
  26. "Maximal length of found backward references for advanced mode.");
  27. DEFINE_int32(skip, 1, "Number of bytes to skip.");
  28. const size_t kFileBufferSize = (1 << 16); // 64KB
  29. typedef int sarray_type; // Can't make it unsigned because of templates :(
  30. typedef uint8_t input_type;
  31. typedef uint32_t lcp_type;
  32. typedef std::pair<int, std::vector<int> > entry_type;
  33. typedef std::function<void(sarray_type*, lcp_type*, size_t, int, int, int, int,
  34. int)> Fn;
  35. void ReadInput(FILE* fin, input_type* storage, size_t input_size) {
  36. size_t last_pos = 0;
  37. size_t available_in;
  38. fseek(fin, 0, SEEK_SET);
  39. do {
  40. available_in = fread(storage + last_pos, 1, kFileBufferSize, fin);
  41. last_pos += available_in;
  42. } while (available_in != 0);
  43. assert(last_pos == input_size);
  44. }
  45. void BuildLCP(input_type* storage, sarray_type* sarray, lcp_type* lcp,
  46. size_t size, uint32_t* pos) {
  47. for (int i = 0; i < size; ++i) {
  48. pos[sarray[i]] = i;
  49. }
  50. uint32_t k = 0;
  51. lcp[size - 1] = 0;
  52. for (int i = 0; i < size; ++i) {
  53. if (pos[i] == size - 1) {
  54. k = 0;
  55. continue;
  56. }
  57. uint32_t j = sarray[pos[i] + 1]; // Suffix which follow i-th suffix in SA.
  58. while (i + k < size && j + k < size && storage[i + k] == storage[j + k]) {
  59. ++k;
  60. }
  61. lcp[pos[i]] = k;
  62. if (k > 0) --k;
  63. }
  64. }
  65. inline void PrintReference(sarray_type* sarray, lcp_type* lcp, size_t size,
  66. int idx, int left_ix, int right_ix, int left_lcp,
  67. int right_lcp, FILE* fout) {
  68. int max_lcp_ix;
  69. if (right_ix == size - 1 || (left_ix >= 0 && left_lcp >= right_lcp)) {
  70. max_lcp_ix = left_ix;
  71. } else {
  72. max_lcp_ix = right_ix;
  73. }
  74. int dist = idx - sarray[max_lcp_ix];
  75. assert(dist > 0);
  76. fputc(1, fout);
  77. fwrite(&idx, sizeof(int), 1, fout); // Position in input.
  78. fwrite(&dist, sizeof(int), 1, fout); // Backward distance.
  79. }
  80. inline void GoLeft(sarray_type* sarray, lcp_type* lcp, int idx, int left_ix,
  81. int left_lcp, entry_type* entry) {
  82. entry->first = left_lcp;
  83. if (left_lcp > absl::GetFlag(FLAGS_long_length)) return;
  84. for (; left_ix >= 0; --left_ix) {
  85. if (lcp[left_ix] < left_lcp) break;
  86. if (sarray[left_ix] < idx) {
  87. entry->second.push_back(idx - sarray[left_ix]);
  88. }
  89. }
  90. }
  91. inline void GoRight(sarray_type* sarray, lcp_type* lcp, int idx, size_t size,
  92. int right_ix, int right_lcp, entry_type* entry) {
  93. entry->first = right_lcp;
  94. if (right_lcp > absl::GetFlag(FLAGS_long_length)) return;
  95. for (; right_ix < size - 1; ++right_ix) {
  96. if (lcp[right_ix] < right_lcp) break;
  97. if (sarray[right_ix] < idx) {
  98. entry->second.push_back(idx - sarray[right_ix]);
  99. }
  100. }
  101. }
  102. inline void StoreReference(sarray_type* sarray, lcp_type* lcp, size_t size,
  103. int idx, int left_ix, int right_ix, int left_lcp,
  104. int right_lcp, entry_type* entries) {
  105. if (right_ix == size - 1 || (left_ix >= 0 && left_lcp > right_lcp)) {
  106. // right is invalid or left is better
  107. GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]);
  108. } else if (left_ix < 0 || (right_ix < size - 1 && right_lcp > left_lcp)) {
  109. // left is invalid or right is better
  110. GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]);
  111. } else { // both are valid and of equal length
  112. GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]);
  113. GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]);
  114. }
  115. }
  116. void ProcessReferences(sarray_type* sarray, lcp_type* lcp, size_t size,
  117. uint32_t* pos, const Fn& process_output) {
  118. int min_length = absl::GetFlag(FLAGS_min_length);
  119. for (int idx = absl::GetFlag(FLAGS_skip); idx < size; ++idx) {
  120. int left_lcp = -1;
  121. int left_ix;
  122. for (left_ix = pos[idx] - 1; left_ix >= 0; --left_ix) {
  123. if (left_lcp == -1 || left_lcp > lcp[left_ix]) {
  124. left_lcp = lcp[left_ix];
  125. }
  126. if (left_lcp == 0) break;
  127. if (sarray[left_ix] < idx) break;
  128. }
  129. int right_lcp = -1;
  130. int right_ix;
  131. for (right_ix = pos[idx]; right_ix < size - 1; ++right_ix) {
  132. if (right_lcp == -1 || right_lcp > lcp[right_ix]) {
  133. right_lcp = lcp[right_ix];
  134. }
  135. // Stop if we have better result from the left side already.
  136. if (right_lcp < left_lcp && left_ix >= 0) break;
  137. if (right_lcp == 0) break;
  138. if (sarray[right_ix] < idx) break;
  139. }
  140. if ((left_ix >= 0 && left_lcp >= min_length) ||
  141. (right_ix < size - 1 && right_lcp >= min_length)) {
  142. process_output(sarray, lcp, size, idx, left_ix, right_ix, left_lcp,
  143. right_lcp);
  144. }
  145. }
  146. }
  147. void ProcessEntries(entry_type* entries, size_t size, FILE* fout) {
  148. int long_length = absl::GetFlag(FLAGS_long_length);
  149. std::vector<std::pair<int, int> > segments;
  150. size_t idx;
  151. for (idx = 0; idx < size;) {
  152. entry_type& entry = entries[idx];
  153. if (entry.first > long_length) {
  154. // Add segment.
  155. if (segments.empty() || segments.back().second < idx) {
  156. segments.push_back({idx, idx + entry.first});
  157. } else {
  158. segments.back().second = idx + entry.first;
  159. }
  160. }
  161. ++idx;
  162. }
  163. printf("Segments generated.\n");
  164. size_t segments_ix = 0;
  165. for (idx = 0; idx < size;) {
  166. if (idx == segments[segments_ix].first) {
  167. // Skip segment.
  168. idx = segments[segments_ix].second;
  169. } else {
  170. for (auto& dist : entries[idx].second) {
  171. fputc(1, fout);
  172. fwrite(&idx, sizeof(int), 1, fout); // Position in input.
  173. fwrite(&dist, sizeof(int), 1, fout); // Backward distance.
  174. }
  175. ++idx;
  176. }
  177. }
  178. }
  179. int main(int argc, char* argv[]) {
  180. base::ParseCommandLine(&argc, &argv);
  181. if (argc != 3) {
  182. printf("usage: %s input_file output_file\n", argv[0]);
  183. return 1;
  184. }
  185. FILE* fin = fopen(argv[1], "rb");
  186. FILE* fout = fopen(argv[2], "w");
  187. fseek(fin, 0, SEEK_END);
  188. int input_size = ftell(fin);
  189. fseek(fin, 0, SEEK_SET);
  190. printf("The file size is %u bytes\n", input_size);
  191. input_type* storage = new input_type[input_size];
  192. ReadInput(fin, storage, input_size);
  193. fclose(fin);
  194. sarray_type* sarray = new sarray_type[input_size];
  195. saisxx(storage, sarray, input_size);
  196. printf("Suffix array calculated.\n");
  197. // Inverse suffix array.
  198. uint32_t* pos = new uint32_t[input_size];
  199. lcp_type* lcp = new lcp_type[input_size];
  200. BuildLCP(storage, sarray, lcp, input_size, pos);
  201. printf("LCP array constructed.\n");
  202. delete[] storage;
  203. using std::placeholders::_1;
  204. using std::placeholders::_2;
  205. using std::placeholders::_3;
  206. using std::placeholders::_4;
  207. using std::placeholders::_5;
  208. using std::placeholders::_6;
  209. using std::placeholders::_7;
  210. using std::placeholders::_8;
  211. entry_type* entries;
  212. if (absl::GetFlag(FLAGS_advanced)) {
  213. entries = new entry_type[input_size];
  214. for (size_t i = 0; i < input_size; ++i) entries[i].first = -1;
  215. }
  216. Fn print = std::bind(PrintReference, _1, _2, _3, _4, _5, _6, _7, _8, fout);
  217. Fn store = std::bind(StoreReference, _1, _2, _3, _4, _5, _6, _7, _8, entries);
  218. ProcessReferences(sarray, lcp, input_size, pos,
  219. absl::GetFlag(FLAGS_advanced) ? store : print);
  220. printf("References processed.\n");
  221. if (absl::GetFlag(FLAGS_advanced)) {
  222. int good_cnt = 0;
  223. uint64_t avg_cnt = 0;
  224. for (size_t i = 0; i < input_size; ++i) {
  225. if (entries[i].first != -1) {
  226. ++good_cnt;
  227. avg_cnt += entries[i].second.size();
  228. }
  229. }
  230. printf("Number of covered positions = %d\n", good_cnt);
  231. printf("Average number of references per covered position = %.4lf\n",
  232. static_cast<double>(avg_cnt) / good_cnt);
  233. ProcessEntries(entries, input_size, fout);
  234. printf("Entries processed.\n");
  235. }
  236. fclose(fout);
  237. return 0;
  238. }