| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259 |
- #include "sieve.h"
- /* Pointer to position in (combined corpus) text. */
- typedef uint32_t TextIdx;
- /* Index of sample / generation. */
- typedef uint16_t SampleIdx;
- typedef struct Slot {
- TextIdx next;
- TextIdx offset;
- SampleIdx presence;
- SampleIdx mark;
- } Slot;
- static const TextIdx kNowhere = static_cast<TextIdx>(-1);
- static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
- TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
- TextIdx from = kNowhere;
- TextIdx to = kNowhere;
- TextIdx result = 0;
- SampleIdx targetPresence = minPresence;
- for (TextIdx i = 0; i < end; ++i) {
- if (i == middle) {
- targetPresence++;
- }
- Slot& item = map[shortcut[i]];
- if (item.mark != iteration) {
- item.mark = iteration;
- if (item.presence >= targetPresence) {
- if ((to == kNowhere) || (to < i)) {
- if (from != kNowhere) {
- result += to - from;
- }
- from = i;
- }
- to = i + sliceLen;
- }
- }
- }
- if (from != kNowhere) {
- result += to - from;
- }
- return result;
- }
- static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
- Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
- SampleIdx minPresence, SampleIdx iteration) {
- std::string output;
- TextIdx from = kNowhere;
- TextIdx to = kNowhere;
- SampleIdx targetPresence = minPresence;
- for (TextIdx i = 0; i < end; ++i) {
- if (i == middle) {
- targetPresence++;
- }
- Slot& item = map[shortcut[i]];
- if (item.mark != iteration) {
- item.mark = iteration;
- if (item.presence >= targetPresence) {
- if ((to == kNowhere) || (to < i)) {
- if (from != kNowhere) {
- output.insert(output.end(), &data[from], &data[to]);
- }
- from = i;
- }
- to = i + sliceLen;
- }
- }
- }
- if (from != kNowhere) {
- output.insert(output.end(), &data[from], &data[to]);
- }
- return output;
- }
- std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
- const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
- /* Parameters aliasing */
- TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
- if (targetSize != dictionary_size_limit) {
- fprintf(stderr, "dictionary_size_limit is too large\n");
- return "";
- }
- TextIdx sliceLen = static_cast<TextIdx>(slice_len);
- if (sliceLen != slice_len) {
- fprintf(stderr, "slice_len is too large\n");
- return "";
- }
- if (sliceLen < 1) {
- fprintf(stderr, "slice_len is too small\n");
- return "";
- }
- SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
- if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) {
- fprintf(stderr, "too many samples\n");
- return "";
- }
- const uint8_t* data = sample_data;
- TextIdx total = 0;
- std::vector<TextIdx> offsets;
- for (SampleIdx i = 0; i < numSamples; ++i) {
- TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
- if (delta != sample_sizes[i]) {
- fprintf(stderr, "sample is too large\n");
- return "";
- }
- if (delta == 0) {
- fprintf(stderr, "empty samples are prohibited\n");
- return "";
- }
- if (total + delta <= total) {
- fprintf(stderr, "corpus is too large\n");
- return "";
- }
- total += delta;
- offsets.push_back(total);
- }
- if (total * 2 < total) {
- fprintf(stderr, "corpus is too large\n");
- return "";
- }
- if (total < sliceLen) {
- fprintf(stderr, "slice_len is larger than corpus size\n");
- return "";
- }
- /*****************************************************************************
- * Build coverage map.
- ****************************************************************************/
- std::vector<Slot> map;
- std::vector<TextIdx> shortcut;
- map.push_back({0, 0, 0, 0});
- TextIdx end = total - sliceLen;
- TextIdx hashLen = 11;
- while (hashLen < 29 && ((1u << hashLen) < end)) {
- hashLen += 3;
- }
- hashLen -= 3;
- TextIdx hashMask = (1u << hashLen) - 1u;
- std::vector<TextIdx> hashHead(1 << hashLen);
- TextIdx hashSlot = 1;
- SampleIdx piece = 0;
- TextIdx hash = 0;
- TextIdx lShift = 3;
- TextIdx rShift = hashLen - lShift;
- for (TextIdx i = 0; i < sliceLen - 1; ++i) {
- TextIdx v = data[i];
- hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
- }
- TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
- TextIdx rShiftX = hashLen - lShiftX;
- for (TextIdx i = 0; i < end; ++i) {
- TextIdx v = data[i + sliceLen - 1];
- hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
- if (offsets[piece] == i) {
- piece++;
- }
- TextIdx slot = hashHead[hash];
- while (slot != 0) {
- Slot& item = map[slot];
- TextIdx start = item.offset;
- bool miss = false;
- for (TextIdx j = 0; j < sliceLen; ++j) {
- if (data[i + j] != data[start + j]) {
- miss = true;
- break;
- }
- }
- if (!miss) {
- if (item.mark != piece) {
- item.mark = piece;
- item.presence++;
- }
- shortcut.push_back(slot);
- break;
- }
- slot = item.next;
- }
- if (slot == 0) {
- map.push_back({hashHead[hash], i, 1, piece});
- hashHead[hash] = hashSlot;
- shortcut.push_back(hashSlot);
- hashSlot++;
- }
- v = data[i];
- hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
- }
- /*****************************************************************************
- * Build dictionary of specified size.
- ****************************************************************************/
- SampleIdx a = 1;
- TextIdx size = dryRun(
- sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
- /* Maximal output is smaller than target. */
- if (size <= targetSize) {
- return createDictionary(
- data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
- }
- SampleIdx b = numSamples;
- size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
- if (size == targetSize) {
- return createDictionary(
- data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
- }
- /* Run binary search. */
- if (size < targetSize) {
- /* size(a) > targetSize > size(b) && a < m < b */
- while (a + 1 < b) {
- SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
- size = dryRun(
- sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
- if (size < targetSize) {
- b = m;
- } else if (size > targetSize) {
- a = m;
- } else {
- return createDictionary(
- data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
- }
- }
- } else {
- a = b;
- }
- /* size(minPresence) > targetSize > size(minPresence + 1) */
- SampleIdx minPresence = a;
- TextIdx c = 0;
- TextIdx d = end;
- /* size(a) < targetSize < size(b) && a < m < b */
- while (c + 1 < d) {
- TextIdx m = (c + d) / 2;
- size = dryRun(
- sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
- if (size < targetSize) {
- c = m;
- } else if (size > targetSize) {
- d = m;
- } else {
- return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
- m, minPresence, ++piece);
- }
- }
- bool unrestricted = false;
- if (minPresence <= 2 && !unrestricted) {
- minPresence = 2;
- c = end;
- }
- return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
- minPresence, ++piece);
- }
|