sieve.cc 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
  1. #include "sieve.h"
  2. /* Pointer to position in (combined corpus) text. */
  3. typedef uint32_t TextIdx;
  4. /* Index of sample / generation. */
  5. typedef uint16_t SampleIdx;
  6. typedef struct Slot {
  7. TextIdx next;
  8. TextIdx offset;
  9. SampleIdx presence;
  10. SampleIdx mark;
  11. } Slot;
  12. static const TextIdx kNowhere = static_cast<TextIdx>(-1);
  13. static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
  14. TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
  15. TextIdx from = kNowhere;
  16. TextIdx to = kNowhere;
  17. TextIdx result = 0;
  18. SampleIdx targetPresence = minPresence;
  19. for (TextIdx i = 0; i < end; ++i) {
  20. if (i == middle) {
  21. targetPresence++;
  22. }
  23. Slot& item = map[shortcut[i]];
  24. if (item.mark != iteration) {
  25. item.mark = iteration;
  26. if (item.presence >= targetPresence) {
  27. if ((to == kNowhere) || (to < i)) {
  28. if (from != kNowhere) {
  29. result += to - from;
  30. }
  31. from = i;
  32. }
  33. to = i + sliceLen;
  34. }
  35. }
  36. }
  37. if (from != kNowhere) {
  38. result += to - from;
  39. }
  40. return result;
  41. }
  42. static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
  43. Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
  44. SampleIdx minPresence, SampleIdx iteration) {
  45. std::string output;
  46. TextIdx from = kNowhere;
  47. TextIdx to = kNowhere;
  48. SampleIdx targetPresence = minPresence;
  49. for (TextIdx i = 0; i < end; ++i) {
  50. if (i == middle) {
  51. targetPresence++;
  52. }
  53. Slot& item = map[shortcut[i]];
  54. if (item.mark != iteration) {
  55. item.mark = iteration;
  56. if (item.presence >= targetPresence) {
  57. if ((to == kNowhere) || (to < i)) {
  58. if (from != kNowhere) {
  59. output.insert(output.end(), &data[from], &data[to]);
  60. }
  61. from = i;
  62. }
  63. to = i + sliceLen;
  64. }
  65. }
  66. }
  67. if (from != kNowhere) {
  68. output.insert(output.end(), &data[from], &data[to]);
  69. }
  70. return output;
  71. }
  72. std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
  73. const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
  74. /* Parameters aliasing */
  75. TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
  76. if (targetSize != dictionary_size_limit) {
  77. fprintf(stderr, "dictionary_size_limit is too large\n");
  78. return "";
  79. }
  80. TextIdx sliceLen = static_cast<TextIdx>(slice_len);
  81. if (sliceLen != slice_len) {
  82. fprintf(stderr, "slice_len is too large\n");
  83. return "";
  84. }
  85. if (sliceLen < 1) {
  86. fprintf(stderr, "slice_len is too small\n");
  87. return "";
  88. }
  89. SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
  90. if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) {
  91. fprintf(stderr, "too many samples\n");
  92. return "";
  93. }
  94. const uint8_t* data = sample_data;
  95. TextIdx total = 0;
  96. std::vector<TextIdx> offsets;
  97. for (SampleIdx i = 0; i < numSamples; ++i) {
  98. TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
  99. if (delta != sample_sizes[i]) {
  100. fprintf(stderr, "sample is too large\n");
  101. return "";
  102. }
  103. if (delta == 0) {
  104. fprintf(stderr, "empty samples are prohibited\n");
  105. return "";
  106. }
  107. if (total + delta <= total) {
  108. fprintf(stderr, "corpus is too large\n");
  109. return "";
  110. }
  111. total += delta;
  112. offsets.push_back(total);
  113. }
  114. if (total * 2 < total) {
  115. fprintf(stderr, "corpus is too large\n");
  116. return "";
  117. }
  118. if (total < sliceLen) {
  119. fprintf(stderr, "slice_len is larger than corpus size\n");
  120. return "";
  121. }
  122. /*****************************************************************************
  123. * Build coverage map.
  124. ****************************************************************************/
  125. std::vector<Slot> map;
  126. std::vector<TextIdx> shortcut;
  127. map.push_back({0, 0, 0, 0});
  128. TextIdx end = total - sliceLen;
  129. TextIdx hashLen = 11;
  130. while (hashLen < 29 && ((1u << hashLen) < end)) {
  131. hashLen += 3;
  132. }
  133. hashLen -= 3;
  134. TextIdx hashMask = (1u << hashLen) - 1u;
  135. std::vector<TextIdx> hashHead(1 << hashLen);
  136. TextIdx hashSlot = 1;
  137. SampleIdx piece = 0;
  138. TextIdx hash = 0;
  139. TextIdx lShift = 3;
  140. TextIdx rShift = hashLen - lShift;
  141. for (TextIdx i = 0; i < sliceLen - 1; ++i) {
  142. TextIdx v = data[i];
  143. hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
  144. }
  145. TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
  146. TextIdx rShiftX = hashLen - lShiftX;
  147. for (TextIdx i = 0; i < end; ++i) {
  148. TextIdx v = data[i + sliceLen - 1];
  149. hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
  150. if (offsets[piece] == i) {
  151. piece++;
  152. }
  153. TextIdx slot = hashHead[hash];
  154. while (slot != 0) {
  155. Slot& item = map[slot];
  156. TextIdx start = item.offset;
  157. bool miss = false;
  158. for (TextIdx j = 0; j < sliceLen; ++j) {
  159. if (data[i + j] != data[start + j]) {
  160. miss = true;
  161. break;
  162. }
  163. }
  164. if (!miss) {
  165. if (item.mark != piece) {
  166. item.mark = piece;
  167. item.presence++;
  168. }
  169. shortcut.push_back(slot);
  170. break;
  171. }
  172. slot = item.next;
  173. }
  174. if (slot == 0) {
  175. map.push_back({hashHead[hash], i, 1, piece});
  176. hashHead[hash] = hashSlot;
  177. shortcut.push_back(hashSlot);
  178. hashSlot++;
  179. }
  180. v = data[i];
  181. hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
  182. }
  183. /*****************************************************************************
  184. * Build dictionary of specified size.
  185. ****************************************************************************/
  186. SampleIdx a = 1;
  187. TextIdx size = dryRun(
  188. sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
  189. /* Maximal output is smaller than target. */
  190. if (size <= targetSize) {
  191. return createDictionary(
  192. data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
  193. }
  194. SampleIdx b = numSamples;
  195. size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
  196. if (size == targetSize) {
  197. return createDictionary(
  198. data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
  199. }
  200. /* Run binary search. */
  201. if (size < targetSize) {
  202. /* size(a) > targetSize > size(b) && a < m < b */
  203. while (a + 1 < b) {
  204. SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
  205. size = dryRun(
  206. sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
  207. if (size < targetSize) {
  208. b = m;
  209. } else if (size > targetSize) {
  210. a = m;
  211. } else {
  212. return createDictionary(
  213. data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
  214. }
  215. }
  216. } else {
  217. a = b;
  218. }
  219. /* size(minPresence) > targetSize > size(minPresence + 1) */
  220. SampleIdx minPresence = a;
  221. TextIdx c = 0;
  222. TextIdx d = end;
  223. /* size(a) < targetSize < size(b) && a < m < b */
  224. while (c + 1 < d) {
  225. TextIdx m = (c + d) / 2;
  226. size = dryRun(
  227. sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
  228. if (size < targetSize) {
  229. c = m;
  230. } else if (size > targetSize) {
  231. d = m;
  232. } else {
  233. return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
  234. m, minPresence, ++piece);
  235. }
  236. }
  237. bool unrestricted = false;
  238. if (minPresence <= 2 && !unrestricted) {
  239. minPresence = 2;
  240. c = end;
  241. }
  242. return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
  243. minPresence, ++piece);
  244. }