| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726 |
- #include "durchschlag.h"
- #include <algorithm>
- #include <exception> /* terminate */
- #include <divsufsort.h>
- /* Pointer to position in text. */
- typedef DurchschlagTextIdx TextIdx;
- /* (Sum of) value(s) of slice(s). */
- typedef uint32_t Score;
- typedef struct HashSlot {
- TextIdx next;
- TextIdx offset;
- } HashSlot;
- typedef struct MetaSlot {
- TextIdx mark;
- Score score;
- } MetaSlot;
- typedef struct Range {
- TextIdx start;
- TextIdx end;
- } Range;
- typedef struct Candidate {
- Score score;
- TextIdx position;
- } Candidate;
- struct greaterScore {
- bool operator()(const Candidate& a, const Candidate& b) const {
- return (a.score > b.score) ||
- ((a.score == b.score) && (a.position < b.position));
- }
- };
- struct lessScore {
- bool operator()(const Candidate& a, const Candidate& b) const {
- return (a.score < b.score) ||
- ((a.score == b.score) && (a.position > b.position));
- }
- };
- #define CANDIDATE_BUNDLE_SIZE (1 << 18)
- static void fatal(const char* error) {
- fprintf(stderr, "%s\n", error);
- std::terminate();
- }
- static TextIdx calculateDictionarySize(const std::vector<Range>& ranges) {
- TextIdx result = 0;
- for (size_t i = 0; i < ranges.size(); ++i) {
- const Range& r = ranges[i];
- result += r.end - r.start;
- }
- return result;
- }
- static std::string createDictionary(
- const uint8_t* data, const std::vector<Range>& ranges, size_t limit) {
- std::string output;
- output.reserve(calculateDictionarySize(ranges));
- for (size_t i = 0; i < ranges.size(); ++i) {
- const Range& r = ranges[i];
- output.insert(output.end(), &data[r.start], &data[r.end]);
- }
- if (output.size() > limit) {
- output.resize(limit);
- }
- return output;
- }
- /* precondition: span > 0
- precondition: end + span == len(shortcut) */
- static Score buildCandidatesList(std::vector<Candidate>* candidates,
- std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
- TextIdx end) {
- candidates->resize(0);
- size_t n = map->size();
- MetaSlot* slots = map->data();
- for (size_t j = 0; j < n; ++j) {
- slots[j].mark = 0;
- }
- Score score = 0;
- /* Consider the whole span, except one last item. The following loop will
- add the last item to the end of the "chain", evaluate it, and cut one
- "link" form the beginning. */
- for (size_t j = 0; j < span - 1; ++j) {
- MetaSlot& item = slots[shortcut[j]];
- if (item.mark == 0) {
- score += item.score;
- }
- item.mark++;
- }
- TextIdx i = 0;
- TextIdx limit = std::min<TextIdx>(end, CANDIDATE_BUNDLE_SIZE);
- Score maxScore = 0;
- for (; i < limit; ++i) {
- TextIdx slice = shortcut[i + span - 1];
- MetaSlot& pick = slots[slice];
- if (pick.mark == 0) {
- score += pick.score;
- }
- pick.mark++;
- if (score > maxScore) {
- maxScore = score;
- }
- candidates->push_back({score, i});
- MetaSlot& drop = slots[shortcut[i]];
- drop.mark--;
- if (drop.mark == 0) {
- score -= drop.score;
- }
- }
- std::make_heap(candidates->begin(), candidates->end(), greaterScore());
- Score minScore = candidates->at(0).score;
- for (; i < end; ++i) {
- TextIdx slice = shortcut[i + span - 1];
- MetaSlot& pick = slots[slice];
- if (pick.mark == 0) {
- score += pick.score;
- }
- pick.mark++;
- if (score > maxScore) {
- maxScore = score;
- }
- if (score >= minScore) {
- candidates->push_back({score, i});
- std::push_heap(candidates->begin(), candidates->end(), greaterScore());
- if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) {
- while (candidates->at(0).score == minScore) {
- std::pop_heap(candidates->begin(), candidates->end(), greaterScore());
- candidates->pop_back();
- }
- minScore = candidates->at(0).score;
- }
- }
- MetaSlot& drop = slots[shortcut[i]];
- drop.mark--;
- if (drop.mark == 0) {
- score -= drop.score;
- }
- }
- for (size_t j = 0; j < n; ++j) {
- slots[j].mark = 0;
- }
- std::make_heap(candidates->begin(), candidates->end(), lessScore());
- return minScore;
- }
- /* precondition: span > 0
- precondition: end + span == len(shortcut) */
- static Score rebuildCandidatesList(std::vector<TextIdx>* candidates,
- std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
- TextIdx end, TextIdx* next) {
- size_t n = candidates->size();
- TextIdx* data = candidates->data();
- for (size_t i = 0; i < n; ++i) {
- data[i] = 0;
- }
- n = map->size();
- MetaSlot* slots = map->data();
- for (size_t i = 0; i < n; ++i) {
- slots[i].mark = 0;
- }
- Score score = 0;
- /* Consider the whole span, except one last item. The following loop will
- add the last item to the end of the "chain", evaluate it, and cut one
- "link" form the beginning. */
- for (TextIdx i = 0; i < span - 1; ++i) {
- MetaSlot& item = slots[shortcut[i]];
- if (item.mark == 0) {
- score += item.score;
- }
- item.mark++;
- }
- Score maxScore = 0;
- for (TextIdx i = 0; i < end; ++i) {
- MetaSlot& pick = slots[shortcut[i + span - 1]];
- if (pick.mark == 0) {
- score += pick.score;
- }
- pick.mark++;
- if (candidates->size() <= score) {
- candidates->resize(score + 1);
- }
- if (score > maxScore) {
- maxScore = score;
- }
- next[i] = candidates->at(score);
- candidates->at(score) = i;
- MetaSlot& drop = slots[shortcut[i]];
- drop.mark--;
- if (drop.mark == 0) {
- score -= drop.score;
- }
- }
- for (size_t i = 0; i < n; ++i) {
- slots[i].mark = 0;
- }
- candidates->resize(maxScore + 1);
- return maxScore;
- }
- static void addRange(std::vector<Range>* ranges, TextIdx start, TextIdx end) {
- for (auto it = ranges->begin(); it != ranges->end();) {
- if (end < it->start) {
- ranges->insert(it, {start, end});
- return;
- }
- if (it->end < start) {
- it++;
- continue;
- }
- // Combine with existing.
- start = std::min(start, it->start);
- end = std::max(end, it->end);
- // Remove consumed vector and continue.
- it = ranges->erase(it);
- }
- ranges->push_back({start, end});
- }
- std::string durchschlag_generate(
- size_t dictionary_size_limit, size_t slice_len, size_t block_len,
- const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
- DurchschlagContext ctx = durchschlag_prepare(
- slice_len, sample_sizes, sample_data);
- return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE,
- dictionary_size_limit, block_len, ctx, sample_data);
- }
- DurchschlagContext durchschlag_prepare(size_t slice_len,
- const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
- /* Parameters aliasing */
- TextIdx sliceLen = static_cast<TextIdx>(slice_len);
- if (sliceLen != slice_len) fatal("slice_len is too large");
- if (sliceLen < 1) fatal("slice_len is too small");
- const uint8_t* data = sample_data;
- TextIdx total = 0;
- std::vector<TextIdx> offsets;
- offsets.reserve(sample_sizes.size());
- for (size_t i = 0; i < sample_sizes.size(); ++i) {
- TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
- if (delta != sample_sizes[i]) fatal("sample is too large");
- if (delta == 0) fatal("0-length samples are prohibited");
- TextIdx next_total = total + delta;
- if (next_total <= total) fatal("corpus is too large");
- total = next_total;
- offsets.push_back(total);
- }
- if (total < sliceLen) fatal("slice_len is larger than corpus size");
- TextIdx end = total - static_cast<TextIdx>(sliceLen) + 1;
- TextIdx hashLen = 11;
- while (hashLen < 29 && ((1u << hashLen) < end)) {
- hashLen += 3;
- }
- hashLen -= 3;
- TextIdx hashMask = (1u << hashLen) - 1u;
- std::vector<TextIdx> hashHead(1 << hashLen);
- TextIdx hash = 0;
- TextIdx lShift = 3;
- TextIdx rShift = hashLen - lShift;
- for (TextIdx i = 0; i < sliceLen - 1; ++i) {
- TextIdx v = data[i];
- hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
- }
- TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
- TextIdx rShiftX = hashLen - lShiftX;
- std::vector<HashSlot> map;
- map.push_back({0, 0});
- TextIdx hashSlot = 1;
- std::vector<TextIdx> sliceMap;
- sliceMap.reserve(end);
- for (TextIdx i = 0; i < end; ++i) {
- TextIdx v = data[i + sliceLen - 1];
- TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
- v = data[i];
- hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask);
- TextIdx slot = hashHead[bucket];
- while (slot != 0) {
- HashSlot& item = map[slot];
- TextIdx start = item.offset;
- bool miss = false;
- for (TextIdx j = 0; j < sliceLen; ++j) {
- if (data[i + j] != data[start + j]) {
- miss = true;
- break;
- }
- }
- if (!miss) {
- sliceMap.push_back(slot);
- break;
- }
- slot = item.next;
- }
- if (slot == 0) {
- map.push_back({hashHead[bucket], i});
- hashHead[bucket] = hashSlot;
- sliceMap.push_back(hashSlot);
- hashSlot++;
- }
- }
- return {total, sliceLen, static_cast<TextIdx>(map.size()),
- std::move(offsets), std::move(sliceMap)};
- }
- DurchschlagContext durchschlag_prepare(size_t slice_len,
- const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index) {
- /* Parameters aliasing */
- TextIdx sliceLen = static_cast<TextIdx>(slice_len);
- if (sliceLen != slice_len) fatal("slice_len is too large");
- if (sliceLen < 1) fatal("slice_len is too small");
- const TextIdx* lcp = index.lcp.data();
- const TextIdx* sa = index.sa.data();
- TextIdx total = 0;
- std::vector<TextIdx> offsets;
- offsets.reserve(sample_sizes.size());
- for (size_t i = 0; i < sample_sizes.size(); ++i) {
- TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
- if (delta != sample_sizes[i]) fatal("sample is too large");
- if (delta == 0) fatal("0-length samples are prohibited");
- TextIdx next_total = total + delta;
- if (next_total <= total) fatal("corpus is too large");
- total = next_total;
- offsets.push_back(total);
- }
- if (total < sliceLen) fatal("slice_len is larger than corpus size");
- TextIdx counter = 1;
- TextIdx end = total - sliceLen + 1;
- std::vector<TextIdx> sliceMap(total);
- TextIdx last = 0;
- TextIdx current = 1;
- while (current <= total) {
- if (lcp[current - 1] < sliceLen) {
- for (TextIdx i = last; i < current; ++i) {
- sliceMap[sa[i]] = counter;
- }
- counter++;
- last = current;
- }
- current++;
- }
- sliceMap.resize(end);
- // Reorder items for the better locality.
- std::vector<TextIdx> reorder(counter);
- counter = 1;
- for (TextIdx i = 0; i < end; ++i) {
- if (reorder[sliceMap[i]] == 0) {
- reorder[sliceMap[i]] = counter++;
- }
- }
- for (TextIdx i = 0; i < end; ++i) {
- sliceMap[i] = reorder[sliceMap[i]];
- }
- return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)};
- }
- DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data) {
- TextIdx total = static_cast<TextIdx>(data.size());
- if (total != data.size()) fatal("corpus is too large");
- saidx_t saTotal = static_cast<saidx_t>(total);
- if (saTotal < 0) fatal("corpus is too large");
- if (static_cast<TextIdx>(saTotal) != total) fatal("corpus is too large");
- std::vector<TextIdx> sa(total);
- /* Hopefully, non-negative int32_t values match TextIdx ones. */
- if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch");
- int32_t* saData = reinterpret_cast<int32_t*>(sa.data());
- divsufsort(data.data(), saData, saTotal);
- std::vector<TextIdx> isa(total);
- for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i;
- // TODO(eustas): borrowed -> unknown efficiency.
- std::vector<TextIdx> lcp(total);
- TextIdx k = 0;
- lcp[total - 1] = 0;
- for (TextIdx i = 0; i < total; ++i) {
- TextIdx current = isa[i];
- if (current == total - 1) {
- k = 0;
- continue;
- }
- TextIdx j = sa[current + 1]; // Suffix which follow i-th suffix.
- while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) {
- ++k;
- }
- lcp[current] = k;
- if (k > 0) --k;
- }
- return {std::move(lcp), std::move(sa)};
- }
- static void ScoreSlices(const std::vector<TextIdx>& offsets,
- std::vector<MetaSlot>& map, const TextIdx* shortcut, TextIdx end) {
- TextIdx piece = 0;
- /* Fresh map contains all zeroes -> initial mark should be different. */
- TextIdx mark = 1;
- for (TextIdx i = 0; i < end; ++i) {
- if (offsets[piece] == i) {
- piece++;
- mark++;
- }
- MetaSlot& item = map[shortcut[i]];
- if (item.mark != mark) {
- item.mark = mark;
- item.score++;
- }
- }
- }
- static std::string durchschlagGenerateExclusive(
- size_t dictionary_size_limit, size_t block_len,
- const DurchschlagContext& context, const uint8_t* sample_data) {
- /* Parameters aliasing */
- TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
- if (targetSize != dictionary_size_limit) {
- fprintf(stderr, "dictionary_size_limit is too large\n");
- return "";
- }
- TextIdx sliceLen = context.sliceLen;
- TextIdx total = context.dataSize;
- TextIdx blockLen = static_cast<TextIdx>(block_len);
- if (blockLen != block_len) {
- fprintf(stderr, "block_len is too large\n");
- return "";
- }
- const uint8_t* data = sample_data;
- const std::vector<TextIdx>& offsets = context.offsets;
- std::vector<MetaSlot> map(context.numUniqueSlices);
- const TextIdx* shortcut = context.sliceMap.data();
- /* Initialization */
- if (blockLen < sliceLen) {
- fprintf(stderr, "sliceLen is larger than block_len\n");
- return "";
- }
- if (targetSize < blockLen || total < blockLen) {
- fprintf(stderr, "block_len is too large\n");
- return "";
- }
- TextIdx end = total - sliceLen + 1;
- ScoreSlices(offsets, map, shortcut, end);
- TextIdx span = blockLen - sliceLen + 1;
- end = static_cast<TextIdx>(context.sliceMap.size()) - span;
- std::vector<TextIdx> candidates;
- std::vector<TextIdx> next(end);
- Score maxScore = rebuildCandidatesList(
- &candidates, &map, span, shortcut, end, next.data());
- /* Block selection */
- const size_t triesLimit = (600 * 1000000) / span;
- const size_t candidatesLimit = (150 * 1000000) / span;
- std::vector<Range> ranges;
- TextIdx mark = 0;
- size_t numTries = 0;
- while (true) {
- TextIdx dictSize = calculateDictionarySize(ranges);
- size_t numCandidates = 0;
- if (dictSize > targetSize - blockLen) {
- break;
- }
- if (maxScore == 0) {
- break;
- }
- while (true) {
- TextIdx candidate = 0;
- while (maxScore > 0) {
- if (candidates[maxScore] != 0) {
- candidate = candidates[maxScore];
- candidates[maxScore] = next[candidate];
- break;
- }
- maxScore--;
- }
- if (maxScore == 0) {
- break;
- }
- mark++;
- numTries++;
- numCandidates++;
- Score score = 0;
- for (size_t j = candidate; j < candidate + span; ++j) {
- MetaSlot& item = map[shortcut[j]];
- if (item.mark != mark) {
- score += item.score;
- item.mark = mark;
- }
- }
- if (score < maxScore) {
- if (numTries < triesLimit && numCandidates < candidatesLimit) {
- next[candidate] = candidates[score];
- candidates[score] = candidate;
- } else {
- maxScore = rebuildCandidatesList(
- &candidates, &map, span, shortcut, end, next.data());
- mark = 0;
- numTries = 0;
- numCandidates = 0;
- }
- continue;
- } else if (score > maxScore) {
- fprintf(stderr, "Broken invariant\n");
- return "";
- }
- for (TextIdx j = candidate; j < candidate + span; ++j) {
- MetaSlot& item = map[shortcut[j]];
- item.score = 0;
- }
- addRange(&ranges, candidate, candidate + blockLen);
- break;
- }
- }
- return createDictionary(data, ranges, targetSize);
- }
- static std::string durchschlagGenerateCollaborative(
- size_t dictionary_size_limit, size_t block_len,
- const DurchschlagContext& context, const uint8_t* sample_data) {
- /* Parameters aliasing */
- TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
- if (targetSize != dictionary_size_limit) {
- fprintf(stderr, "dictionary_size_limit is too large\n");
- return "";
- }
- TextIdx sliceLen = context.sliceLen;
- TextIdx total = context.dataSize;
- TextIdx blockLen = static_cast<TextIdx>(block_len);
- if (blockLen != block_len) {
- fprintf(stderr, "block_len is too large\n");
- return "";
- }
- const uint8_t* data = sample_data;
- const std::vector<TextIdx>& offsets = context.offsets;
- std::vector<MetaSlot> map(context.numUniqueSlices);
- const TextIdx* shortcut = context.sliceMap.data();
- /* Initialization */
- if (blockLen < sliceLen) {
- fprintf(stderr, "sliceLen is larger than block_len\n");
- return "";
- }
- if (targetSize < blockLen || total < blockLen) {
- fprintf(stderr, "block_len is too large\n");
- return "";
- }
- TextIdx end = total - sliceLen + 1;
- ScoreSlices(offsets, map, shortcut, end);
- TextIdx span = blockLen - sliceLen + 1;
- end = static_cast<TextIdx>(context.sliceMap.size()) - span;
- std::vector<Candidate> candidates;
- candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024);
- Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
- /* Block selection */
- std::vector<Range> ranges;
- TextIdx mark = 0;
- while (true) {
- TextIdx dictSize = calculateDictionarySize(ranges);
- if (dictSize > targetSize - blockLen) {
- break;
- }
- if (minScore == 0 && candidates.empty()) {
- break;
- }
- while (true) {
- if (candidates.empty()) {
- minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
- mark = 0;
- }
- TextIdx candidate = candidates[0].position;
- Score expectedScore = candidates[0].score;
- if (expectedScore == 0) {
- candidates.resize(0);
- break;
- }
- std::pop_heap(candidates.begin(), candidates.end(), lessScore());
- candidates.pop_back();
- mark++;
- Score score = 0;
- for (TextIdx j = candidate; j < candidate + span; ++j) {
- MetaSlot& item = map[shortcut[j]];
- if (item.mark != mark) {
- score += item.score;
- item.mark = mark;
- }
- }
- if (score < expectedScore) {
- if (score >= minScore) {
- candidates.push_back({score, candidate});
- std::push_heap(candidates.begin(), candidates.end(), lessScore());
- }
- continue;
- } else if (score > expectedScore) {
- fatal("Broken invariant");
- }
- for (TextIdx j = candidate; j < candidate + span; ++j) {
- MetaSlot& item = map[shortcut[j]];
- item.score = 0;
- }
- addRange(&ranges, candidate, candidate + blockLen);
- break;
- }
- }
- return createDictionary(data, ranges, targetSize);
- }
- std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
- size_t dictionary_size_limit, size_t block_len,
- const DurchschlagContext& context, const uint8_t* sample_data) {
- if (strategy == DURCHSCHLAG_COLLABORATIVE) {
- return durchschlagGenerateCollaborative(
- dictionary_size_limit, block_len, context, sample_data);
- } else {
- return durchschlagGenerateExclusive(
- dictionary_size_limit, block_len, context, sample_data);
- }
- }
- void durchschlag_distill(size_t slice_len, size_t minimum_population,
- std::vector<size_t>* sample_sizes, uint8_t* sample_data) {
- /* Parameters aliasing */
- uint8_t* data = sample_data;
- /* Build slice map. */
- DurchschlagContext context = durchschlag_prepare(
- slice_len, *sample_sizes, data);
- /* Calculate slice population. */
- const std::vector<TextIdx>& offsets = context.offsets;
- std::vector<MetaSlot> map(context.numUniqueSlices);
- const TextIdx* shortcut = context.sliceMap.data();
- TextIdx sliceLen = context.sliceLen;
- TextIdx total = context.dataSize;
- TextIdx end = total - sliceLen + 1;
- ScoreSlices(offsets, map, shortcut, end);
- /* Condense samples, omitting unique slices. */
- TextIdx readPos = 0;
- TextIdx writePos = 0;
- TextIdx lastNonUniquePos = 0;
- for (TextIdx i = 0; i < sample_sizes->size(); ++i) {
- TextIdx sampleStart = writePos;
- TextIdx oldSampleEnd =
- readPos + static_cast<TextIdx>(sample_sizes->at(i));
- while (readPos < oldSampleEnd) {
- if (readPos < end) {
- MetaSlot& item = map[shortcut[readPos]];
- if (item.score >= minimum_population) {
- lastNonUniquePos = readPos + sliceLen;
- }
- }
- if (readPos < lastNonUniquePos) {
- data[writePos++] = data[readPos];
- }
- readPos++;
- }
- sample_sizes->at(i) = writePos - sampleStart;
- }
- }
- void durchschlag_purify(size_t slice_len, size_t minimum_population,
- const std::vector<size_t>& sample_sizes, uint8_t* sample_data) {
- /* Parameters aliasing */
- uint8_t* data = sample_data;
- /* Build slice map. */
- DurchschlagContext context = durchschlag_prepare(
- slice_len, sample_sizes, data);
- /* Calculate slice population. */
- const std::vector<TextIdx>& offsets = context.offsets;
- std::vector<MetaSlot> map(context.numUniqueSlices);
- const TextIdx* shortcut = context.sliceMap.data();
- TextIdx sliceLen = context.sliceLen;
- TextIdx total = context.dataSize;
- TextIdx end = total - sliceLen + 1;
- ScoreSlices(offsets, map, shortcut, end);
- /* Rewrite samples, zeroing out unique slices. */
- TextIdx lastNonUniquePos = 0;
- for (TextIdx readPos = 0; readPos < total; ++readPos) {
- if (readPos < end) {
- MetaSlot& item = map[shortcut[readPos]];
- if (item.score >= minimum_population) {
- lastNonUniquePos = readPos + sliceLen;
- }
- }
- if (readPos >= lastNonUniquePos) {
- data[readPos] = 0;
- }
- }
- }
|