/* deorummolae.cc */
  1. #include "deorummolae.h"
  2. #include <array>
  3. #include <cstdio>
  4. #include <sais.hxx>
  5. #if defined(_MSC_VER)
  6. #include <intrin.h> /* __popcnt64 */
  7. #endif
/* Used for quick SA-entry to file mapping. Each file is padded to size that
   is a multiple of chunk size. */
#define CHUNK_SIZE 64

/* Length of substring that is considered to be covered by dictionary string. */
#define CUT_MATCH 6

/* Minimal dictionary entry size. */
#define MIN_MATCH 24

/* Non tunable definitions. */
#define CHUNK_MASK (CHUNK_SIZE - 1)
#define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6))

/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;

/* Symbol of text alphabet. Values < 256 are literal bytes; values >= 256 are
   synthetic terminators (one unique terminator per file / poisoned cut). */
typedef int32_t TextChar;

/* Pointer to position in text. */
typedef uint32_t TextIdx;

/* SAIS sarray_type; unfortunately, must be a signed type. */
typedef int32_t TextSaIdx;
  26. static size_t popcount(uint64_t u) {
  27. #if defined(_MSC_VER)
  28. return static_cast<size_t>(__popcnt64(u));
  29. #else
  30. return static_cast<size_t>(__builtin_popcountll(u));
  31. #endif
  32. }
  33. /* Condense terminators and pad file entries. */
  34. static void rewriteText(std::vector<TextChar>* text) {
  35. TextChar terminator = text->back();
  36. TextChar prev = terminator;
  37. TextIdx to = 0;
  38. for (TextIdx from = 0; from < text->size(); ++from) {
  39. TextChar next = text->at(from);
  40. if (next < 256 || prev < 256) {
  41. text->at(to++) = next;
  42. if (next >= 256) terminator = next;
  43. }
  44. prev = next;
  45. }
  46. text->resize(to);
  47. if (text->empty()) text->push_back(terminator);
  48. while (text->size() & CHUNK_MASK) text->push_back(terminator);
  49. }
  50. /* Reenumerate terminators for smaller alphabet. */
  51. static void remapTerminators(std::vector<TextChar>* text,
  52. TextChar* next_terminator) {
  53. TextChar prev = -1;
  54. TextChar x = 256;
  55. for (TextIdx i = 0; i < text->size(); ++i) {
  56. TextChar next = text->at(i);
  57. if (next < 256) { // Char.
  58. // Do nothing.
  59. } else if (prev < 256) { // Terminator after char.
  60. next = x++;
  61. } else { // Terminator after terminator.
  62. next = prev;
  63. }
  64. text->at(i) = next;
  65. prev = next;
  66. }
  67. *next_terminator = x;
  68. }
  69. /* Combine all file entries; create mapping position->file. */
  70. static void buildFullText(std::vector<std::vector<TextChar>>* data,
  71. std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map,
  72. std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
  73. file_map->resize(0);
  74. file_offset->resize(0);
  75. full_text->resize(0);
  76. for (TextIdx i = 0; i < data->size(); ++i) {
  77. file_offset->push_back(full_text->size());
  78. std::vector<TextChar>& file = data->at(i);
  79. rewriteText(&file);
  80. full_text->insert(full_text->end(), file.begin(), file.end());
  81. file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
  82. }
  83. if (false) remapTerminators(full_text, next_terminator);
  84. }
/* Build longest-common-prefix array based on suffix array and text
   (Kasai's algorithm; requires the inverse suffix array).
   TODO(eustas): borrowed -> unknown efficiency. */
static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa,
    std::vector<TextIdx>* lcp, std::vector<TextIdx>* invese_sa) {
  TextIdx size = static_cast<TextIdx>(text->size());
  lcp->resize(size);
  /* k carries over between text positions: lcp can shrink by at most one
     when moving from suffix i to suffix i + 1. */
  TextIdx k = 0;
  lcp->at(size - 1) = 0;  /* Last SA entry has no successor to compare with. */
  for (TextIdx i = 0; i < size; ++i) {
    if (invese_sa->at(i) == size - 1) {
      /* Suffix i is last in SA order; nothing follows it. */
      k = 0;
      continue;
    }
    // Suffix which follows the i-th suffix in SA order.
    TextIdx j = sa->at(invese_sa->at(i) + 1);
    /* Extend the common prefix of suffixes starting at i and j. */
    while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
      ++k;
    }
    lcp->at(invese_sa->at(i)) = k;
    if (k > 0) --k;
  }
}
/* Isle is a range in SA with LCP not less than some value.
   When we raise the LCP requirement, the isle sinks and smaller isles appear
   instead. */
typedef struct {
  TextIdx lcp;        /* Common-prefix length shared across the isle. */
  TextIdx l;          /* Left boundary in SA. */
  TextIdx r;          /* Right boundary in SA. */
  Coverage coverage;  /* Bitset of files contributing suffixes to the isle. */
} Isle;
  116. /* Helper routine for `cutMatch`. */
  117. static void poisonData(TextIdx pos, TextIdx length,
  118. std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map,
  119. std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
  120. TextIdx f = file_map->at(pos / CHUNK_SIZE);
  121. pos -= file_offset->at(f);
  122. std::vector<TextChar>& file = data->at(f);
  123. TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1;
  124. for (TextIdx j = 0; j < l; j++, pos++) {
  125. if (file[pos] >= 256) continue;
  126. if (file[pos + 1] >= 256) {
  127. file[pos] = file[pos + 1];
  128. } else if (pos > 0 && file[pos - 1] >= 256) {
  129. file[pos] = file[pos - 1];
  130. } else {
  131. file[pos] = (*next_terminator)++;
  132. }
  133. }
  134. }
/* Remove substrings of a given match from files.
   Substrings are replaced with unique terminators, so next iteration SA would
   not allow to cross removed areas. */
static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index,
    TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp,
    std::vector<TextIdx>* invese_sa, TextChar* next_terminator,
    std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) {
  while (length >= CUT_MATCH) {
    TextIdx i = index;
    /* Walk right across SA entries whose common prefix with the next entry
       is still >= length; poison every such occurrence. */
    while (lcp->at(i) >= length) {
      i++;
      poisonData(
          sa->at(i), length, data, file_map, file_offset, next_terminator);
    }
    /* Walk left from the starting index until the common prefix drops
       below length, poisoning each occurrence (including `index` itself). */
    while (true) {
      poisonData(
          sa->at(index), length, data, file_map, file_offset, next_terminator);
      if (index == 0 || lcp->at(index - 1) < length) break;
      index--;
    }
    /* Shorten the match by one symbol and continue from the suffix that
       follows the current one in text order. */
    length--;
    index = invese_sa->at(sa->at(index) + 1);
  }
}
/* Generate a dictionary of at most `dictionary_size_limit` bytes from the
   given samples: repeatedly build SA + LCP over the concatenated corpus,
   pick the most valuable repeated substring ("isle"), append it to the
   output, and poison its occurrences so the next round selects a different
   entry. Returns "" on invalid input. */
std::string DM_generate(size_t dictionary_size_limit,
    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
  {
    /* Reject limits that overflow TextIdx or exceed 2^30. */
    TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit);
    if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) {
      fprintf(stderr, "dictionary_size_limit is too large\n");
      return "";
    }
  }
  /* Could use 256 + '0' for easier debugging. */
  TextChar next_terminator = 256;

  std::string output;
  std::vector<std::vector<TextChar>> data;

  TextIdx offset = 0;
  size_t num_samples = sample_sizes.size();
  if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES;
  for (size_t n = 0; n < num_samples; ++n) {
    /* Validate each sample size and the running corpus total. */
    TextIdx delta = static_cast<TextIdx>(sample_sizes[n]);
    if (delta != sample_sizes[n]) {
      fprintf(stderr, "sample is too large\n");
      return "";
    }
    if (delta == 0) {
      fprintf(stderr, "0-length samples are prohibited\n");
      return "";
    }
    TextIdx next_offset = offset + delta;
    if (next_offset <= offset) {  /* Overflow of the running offset. */
      fprintf(stderr, "corpus is too large\n");
      return "";
    }
    /* Widen sample bytes to TextChar and append one unique terminator
       per file. */
    data.push_back(
        std::vector<TextChar>(sample_data + offset, sample_data + next_offset));
    offset = next_offset;
    data.back().push_back(next_terminator++);
  }

  /* Most arrays are allocated once, and then just resized to smaller and
     smaller sizes. */
  std::vector<TextChar> full_text;
  std::vector<TextIdx> file_map;
  std::vector<TextIdx> file_offset;
  std::vector<TextIdx> sa;
  std::vector<TextIdx> invese_sa;
  std::vector<TextIdx> lcp;
  std::vector<Isle> isles;
  std::vector<char> output_data;
  TextIdx total = 0;       /* Bytes of dictionary emitted so far. */
  TextIdx total_cost = 0;  /* Accumulated "savings" estimate (for logging). */
  TextIdx best_cost;
  Isle best_isle;
  size_t min_count = num_samples;  /* Required file coverage; relaxed later. */

  while (true) {
    /* Remaining dictionary budget caps the usable entry length. */
    TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total;
    buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
    sa.resize(full_text.size());
    /* Hopefully, non-negative TextSaIdx is the same sa TextIdx counterpart. */
    saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()),
        static_cast<TextChar>(full_text.size()), next_terminator);
    invese_sa.resize(full_text.size());
    for (TextIdx i = 0; i < full_text.size(); ++i) {
      invese_sa[sa[i]] = i;
    }
    buildLcp(&full_text, &sa, &lcp, &invese_sa);

    /* Do not rebuild SA/LCP, just use different selection. */
  retry:
    best_cost = 0;
    best_isle = {0, 0, 0, {{0}}};
    /* Stack of currently open isles; the sentinel has lcp == 0. */
    isles.resize(0);
    isles.push_back(best_isle);

    for (TextIdx i = 0; i < lcp.size(); ++i) {
      TextIdx l = i;
      Coverage cov = {{0}};
      /* Mark the file owning the current suffix. */
      size_t f = file_map[sa[i] / CHUNK_SIZE];
      cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63);
      /* Close every isle whose LCP exceeds the incoming value. */
      while (lcp[i] < isles.back().lcp) {
        Isle& top = isles.back();
        top.r = i;
        l = top.l;
        /* Merge accumulated coverage and count covered files. */
        for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
        size_t count = 0;
        for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
        TextIdx effective_lcp = top.lcp;
        /* Restrict (last) dictionary entry length. */
        if (effective_lcp > max_match) effective_lcp = max_match;
        /* Isle value: covered-file count times (clamped) entry length. */
        TextIdx cost = count * effective_lcp;
        if (cost > best_cost && count >= min_count &&
            effective_lcp >= MIN_MATCH) {
          best_cost = cost;
          best_isle = top;
          best_isle.lcp = effective_lcp;
        }
        isles.pop_back();
        /* Propagate coverage to the enclosing isle. */
        for (size_t x = 0; x < cov.size(); ++x) {
          isles.back().coverage[x] |= cov[x];
        }
      }
      /* Open a new isle whenever the LCP rises. */
      if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
      for (size_t x = 0; x < cov.size(); ++x) {
        isles.back().coverage[x] |= cov[x];
      }
    }

    /* When saturated matches do not match length restrictions, lower the
       saturation requirements. */
    if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
      if (min_count >= 8) {
        min_count = (min_count * 7) / 8;
        fprintf(stderr, "Retry: min_count=%zu\n", min_count);
        goto retry;
      }
      break;
    }

    /* Save the entry. */
    fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n",
        total_cost, best_cost, total, best_isle.lcp);
    /* Narrow the chosen TextChar run back to bytes (all values < 256 here). */
    int* piece = &full_text[sa[best_isle.l]];
    output.insert(output.end(), piece, piece + best_isle.lcp);
    total += best_isle.lcp;
    total_cost += best_cost;
    /* Poison the chosen substring everywhere so the next round differs. */
    cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa,
        &next_terminator, &file_map, &file_offset);
    if (total >= dictionary_size_limit) break;
  }
  return output;
}