// durchschlag.h (3.5 KB)
  1. #ifndef BROTLI_RESEARCH_DURCHSCHLAG_H_
  2. #define BROTLI_RESEARCH_DURCHSCHLAG_H_
  3. #include <cstddef>
  4. #include <cstdint>
  5. #include <string>
  6. #include <vector>
  /**
   * Generate a dictionary for given samples.
   *
   * High-level entry point: it takes the raw samples directly. For repeated
   * generation over the same samples, see the lower-level API declared further
   * down in this header.
   *
   * @param dictionary_size_limit maximal dictionary size
   * @param slice_len text slice size
   * @param block_len score block length
   * @param sample_sizes vector with sample sizes
   * @param sample_data concatenated samples
   *        (NOTE(review): presumably laid out back to back in the order given
   *        by @p sample_sizes -- confirm against the implementation)
   * @return generated dictionary
   */
  std::string durchschlag_generate(size_t dictionary_size_limit,
                                   size_t slice_len,
                                   size_t block_len,
                                   const std::vector<size_t>& sample_sizes,
                                   const uint8_t* sample_data);
  //------------------------------------------------------------------------------
  // Lower-level API for repetitive dictionary generation.
  //------------------------------------------------------------------------------
  /*
   * Pointer to position in text.
   *
   * This is a 32-bit unsigned index, so only positions representable in
   * uint32_t can be addressed.
   */
  using DurchschlagTextIdx = uint32_t;
  25. /* Context is made public for flexible serialization / deserialization. */
  26. typedef struct DurchschlagContext {
  27. DurchschlagTextIdx dataSize;
  28. DurchschlagTextIdx sliceLen;
  29. DurchschlagTextIdx numUniqueSlices;
  30. std::vector<DurchschlagTextIdx> offsets;
  31. std::vector<DurchschlagTextIdx> sliceMap;
  32. } DurchschlagContext;
  33. DurchschlagContext durchschlag_prepare(size_t slice_len,
  34. const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
  /*
   * Resource trade-off used by the context-taking durchschlag_generate()
   * overload.
   *
   * NOTE: the type name is spelled "Durchschalg" (sic); the spelling is kept so
   * that existing callers continue to compile. It stays a plain (unscoped) enum
   * so the enumerators remain visible at namespace scope.
   */
  enum DurchschalgResourceStrategy {
    // Faster
    DURCHSCHLAG_EXCLUSIVE = 0,
    // Uses much less memory
    DURCHSCHLAG_COLLABORATIVE = 1
  };
  41. std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
  42. size_t dictionary_size_limit, size_t block_len,
  43. const DurchschlagContext& context, const uint8_t* sample_data);
  //------------------------------------------------------------------------------
  // Suffix Array based preparation.
  //------------------------------------------------------------------------------
  47. typedef struct DurchschlagIndex {
  48. std::vector<DurchschlagTextIdx> lcp;
  49. std::vector<DurchschlagTextIdx> sa;
  50. } DurchschlagIndex;
  51. DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data);
  52. DurchschlagContext durchschlag_prepare(size_t slice_len,
  53. const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index);
  //------------------------------------------------------------------------------
  // Data preparation.
  //------------------------------------------------------------------------------
  /**
   * Cut out unique slices.
   *
   * Both @p sample_sizes and @p sample_data are modified in-place. The number of
   * samples remains unchanged, but some samples become shorter.
   *
   * @param slice_len (unique) slice size
   * @param minimum_population minimum non-unique slice occurrence
   * @param sample_sizes [in / out] vector with sample sizes
   *        (NOTE(review): presumably must not be null -- confirm)
   * @param sample_data [in / out] concatenated samples
   */
  void durchschlag_distill(size_t slice_len,
                           size_t minimum_population,
                           std::vector<size_t>* sample_sizes,
                           uint8_t* sample_data);
  /**
   * Replace unique slices with zeroes.
   *
   * @p sample_data is modified in-place. The number of samples and their
   * lengths remain unchanged, which is why @p sample_sizes is taken by const
   * reference here.
   *
   * @param slice_len (unique) slice size
   * @param minimum_population minimum non-unique slice occurrence
   * @param sample_sizes vector with sample sizes
   * @param sample_data [in / out] concatenated samples
   */
  void durchschlag_purify(size_t slice_len,
                          size_t minimum_population,
                          const std::vector<size_t>& sample_sizes,
                          uint8_t* sample_data);
  83. #endif // BROTLI_RESEARCH_DURCHSCHLAG_H_