dictionary_generator.cc 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. #include <climits>
  2. #include <cstddef>
  3. #include <cstdio>
  4. #include <cstring>
  5. #include <fstream>
  6. #if !defined(_MSC_VER)
  7. #include <glob.h>
  8. #endif
  9. #include <vector>
  10. #include "deorummolae.h"
  11. #include "durchschlag.h"
  12. #include "sieve.h"
  13. /* This isn't a definitive list of "--foo" arguments, only those that take an
  14. * additional "=#" integer parameter, like "--foo=20" or "--foo=32K".
  15. */
  16. #define LONG_ARG_BLOCK_LEN "--block_len="
  17. #define LONG_ARG_SLICE_LEN "--slice_len="
  18. #define LONG_ARG_TARGET_DICT_LEN "--target_dict_len="
  19. #define LONG_ARG_MIN_SLICE_POP "--min_slice_pop="
  20. #define LONG_ARG_CHUNK_LEN "--chunk_len="
  21. #define LONG_ARG_OVERLAP_LEN "--overlap_len="
  22. #define METHOD_DM 0
  23. #define METHOD_SIEVE 1
  24. #define METHOD_DURCHSCHLAG 2
  25. #define METHOD_DISTILL 3
  26. #define METHOD_PURIFY 4
  27. static size_t readInt(const char* str) {
  28. size_t result = 0;
  29. if (str[0] == 0 || str[0] == '0') {
  30. return 0;
  31. }
  32. for (size_t i = 0; i < 13; ++i) {
  33. if (str[i] == 0) {
  34. return result;
  35. }
  36. if (str[i] == 'k' || str[i] == 'K') {
  37. if ((str[i + 1] == 0) && ((result << 10) > result)) {
  38. return result << 10;
  39. }
  40. return 0;
  41. }
  42. if (str[i] == 'm' || str[i] == 'M') {
  43. if ((str[i + 1] == 0) && ((result << 20) > result)) {
  44. return result << 20;
  45. }
  46. return 0;
  47. }
  48. if (str[i] < '0' || str[i] > '9') {
  49. return 0;
  50. }
  51. size_t next = (10 * result) + (str[i] - '0');
  52. if (next <= result) {
  53. return 0;
  54. }
  55. result = next;
  56. }
  57. return 0;
  58. }
  59. static std::string readFile(const std::string& path) {
  60. std::ifstream file(path);
  61. std::string content(
  62. (std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
  63. return content;
  64. }
  65. static void writeFile(const char* file, const std::string& content) {
  66. std::ofstream outfile(file, std::ofstream::binary);
  67. outfile.write(content.c_str(), static_cast<std::streamsize>(content.size()));
  68. outfile.close();
  69. }
  70. static void writeSamples(const std::vector<std::string>& paths,
  71. const std::vector<size_t>& sizes, const uint8_t* data) {
  72. size_t offset = 0;
  73. for (size_t i = 0; i < paths.size(); ++i) {
  74. const char* path = paths[i].c_str();
  75. size_t sampleSize = sizes[i];
  76. std::ofstream outfile(path, std::ofstream::binary);
  77. outfile.write(reinterpret_cast<const char*>(data + offset),
  78. static_cast<std::streamsize>(sampleSize));
  79. outfile.close();
  80. offset += sampleSize;
  81. }
  82. }
  83. /* Returns "base file name" or its tail, if it contains '/' or '\'. */
  84. static const char* fileName(const char* path) {
  85. const char* separator_position = strrchr(path, '/');
  86. if (separator_position) path = separator_position + 1;
  87. separator_position = strrchr(path, '\\');
  88. if (separator_position) path = separator_position + 1;
  89. return path;
  90. }
  91. static void printHelp(const char* name) {
  92. fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
  93. fprintf(stderr,
  94. "Options:\n"
  95. " --dm use 'deorummolae' engine\n"
  96. " --distill rewrite samples; unique text parts are removed\n"
  97. " --dsh use 'durchschlag' engine (default)\n"
  98. " --purify rewrite samples; unique text parts are zeroed out\n"
  99. " --sieve use 'sieve' engine\n"
  100. " -b#, --block_len=#\n"
  101. " set block length for 'durchschlag'; default: 1024\n"
  102. " -s#, --slice_len=#\n"
  103. " set slice length for 'distill', 'durchschlag', 'purify'\n"
  104. " and 'sieve'; default: 16\n"
  105. " -t#, --target_dict_len=#\n"
  106. " set target dictionary length (limit); default: 16K\n"
  107. " -u#, --min_slice_pop=#\n"
  108. " set minimum slice population (for rewrites); default: 2\n"
  109. " -c#, --chunk_len=#\n"
  110. " if positive, samples are cut into chunks of this length;\n"
  111. " default: 0; cannot mix with 'rewrite samples'\n"
  112. " -o#, --overlap_len=#\n"
  113. " set chunk overlap length; default 0\n"
  114. "# is a decimal number with optional k/K/m/M suffix.\n"
  115. "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
  116. " Completely unique samples might become empty files.\n\n");
  117. }
  118. int main(int argc, char const* argv[]) {
  119. int dictionaryArg = -1;
  120. int method = METHOD_DURCHSCHLAG;
  121. size_t sliceLen = 16;
  122. size_t targetSize = 16 << 10;
  123. size_t blockSize = 1024;
  124. size_t minimumPopulation = 2;
  125. size_t chunkLen = 0;
  126. size_t overlapLen = 0;
  127. std::vector<uint8_t> data;
  128. std::vector<size_t> sizes;
  129. std::vector<std::string> paths;
  130. size_t total = 0;
  131. for (int i = 1; i < argc; ++i) {
  132. if (argv[i] == nullptr) {
  133. continue;
  134. }
  135. if (argv[i][0] == '-') {
  136. char arg1 = argv[i][1];
  137. const char* arg2 = arg1 ? &argv[i][2] : nullptr;
  138. if (arg1 == '-') {
  139. if (dictionaryArg != -1) {
  140. fprintf(stderr,
  141. "Method should be specified before dictionary / sample '%s'\n",
  142. argv[i]);
  143. exit(1);
  144. }
  145. /* Look for "--long_arg" via exact match. */
  146. if (std::strcmp(argv[i], "--sieve") == 0) {
  147. method = METHOD_SIEVE;
  148. continue;
  149. }
  150. if (std::strcmp(argv[i], "--dm") == 0) {
  151. method = METHOD_DM;
  152. continue;
  153. }
  154. if (std::strcmp(argv[i], "--dsh") == 0) {
  155. method = METHOD_DURCHSCHLAG;
  156. continue;
  157. }
  158. if (std::strcmp(argv[i], "--distill") == 0) {
  159. method = METHOD_DISTILL;
  160. continue;
  161. }
  162. if (std::strcmp(argv[i], "--purify") == 0) {
  163. method = METHOD_PURIFY;
  164. continue;
  165. }
  166. /* Look for "--long_arg=#" via prefix match. */
  167. if (std::strncmp(argv[i], LONG_ARG_BLOCK_LEN,
  168. std::strlen(LONG_ARG_BLOCK_LEN)) == 0) {
  169. arg1 = 'b';
  170. arg2 = &argv[i][std::strlen(LONG_ARG_BLOCK_LEN)];
  171. } else if (std::strncmp(argv[i], LONG_ARG_SLICE_LEN,
  172. std::strlen(LONG_ARG_SLICE_LEN)) == 0) {
  173. arg1 = 's';
  174. arg2 = &argv[i][std::strlen(LONG_ARG_SLICE_LEN)];
  175. } else if (std::strncmp(argv[i], LONG_ARG_TARGET_DICT_LEN,
  176. std::strlen(LONG_ARG_TARGET_DICT_LEN)) == 0) {
  177. arg1 = 't';
  178. arg2 = &argv[i][std::strlen(LONG_ARG_TARGET_DICT_LEN)];
  179. } else if (std::strncmp(argv[i], LONG_ARG_MIN_SLICE_POP,
  180. std::strlen(LONG_ARG_MIN_SLICE_POP)) == 0) {
  181. arg1 = 'u';
  182. arg2 = &argv[i][std::strlen(LONG_ARG_MIN_SLICE_POP)];
  183. } else if (std::strncmp(argv[i], LONG_ARG_CHUNK_LEN,
  184. std::strlen(LONG_ARG_CHUNK_LEN)) == 0) {
  185. arg1 = 'c';
  186. arg2 = &argv[i][std::strlen(LONG_ARG_CHUNK_LEN)];
  187. } else if (std::strncmp(argv[i], LONG_ARG_OVERLAP_LEN,
  188. std::strlen(LONG_ARG_OVERLAP_LEN)) == 0) {
  189. arg1 = 'o';
  190. arg2 = &argv[i][std::strlen(LONG_ARG_OVERLAP_LEN)];
  191. } else {
  192. printHelp(fileName(argv[0]));
  193. fprintf(stderr, "Invalid option '%s'\n", argv[i]);
  194. exit(1);
  195. }
  196. }
  197. /* Look for "-f" short args or "--foo=#" long args. */
  198. if (arg1 == 'b') {
  199. blockSize = readInt(arg2);
  200. if (blockSize < 16 || blockSize > 65536) {
  201. printHelp(fileName(argv[0]));
  202. fprintf(stderr, "Invalid option '%s'\n", argv[i]);
  203. exit(1);
  204. }
  205. } else if (arg1 == 's') {
  206. sliceLen = readInt(arg2);
  207. // TODO(eustas): investigate why sliceLen == 4..5 greatly slows down
  208. // durschlag engine, but only from command line;
  209. // durschlag_runner seems to work fine with those.
  210. if (sliceLen < 4 || sliceLen > 256) {
  211. printHelp(fileName(argv[0]));
  212. fprintf(stderr, "Invalid option '%s'\n", argv[i]);
  213. exit(1);
  214. }
  215. } else if (arg1 == 't') {
  216. targetSize = readInt(arg2);
  217. if (targetSize < 256 || targetSize > (1 << 25)) {
  218. printHelp(fileName(argv[0]));
  219. fprintf(stderr, "Invalid option '%s'\n", argv[i]);
  220. exit(1);
  221. }
  222. } else if (arg1 == 'u') {
  223. minimumPopulation = readInt(arg2);
  224. if (minimumPopulation < 256 || minimumPopulation > 65536) {
  225. printHelp(fileName(argv[0]));
  226. fprintf(stderr, "Invalid option '%s'\n", argv[i]);
  227. exit(1);
  228. }
  229. } else if (arg1 == 'c') {
  230. chunkLen = readInt(arg2);
  231. if (chunkLen < 0 || chunkLen > INT_MAX) {
  232. printHelp(fileName(argv[0]));
  233. fprintf(stderr, "Invalid option '%s'\n", argv[i]);
  234. exit(1);
  235. }
  236. } else if (arg1 == 'o') {
  237. overlapLen = readInt(arg2);
  238. if (overlapLen < 0 || overlapLen > INT_MAX) {
  239. printHelp(fileName(argv[0]));
  240. fprintf(stderr, "Invalid option '%s'\n", argv[i]);
  241. exit(1);
  242. }
  243. } else {
  244. printHelp(fileName(argv[0]));
  245. fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
  246. exit(1);
  247. }
  248. continue;
  249. }
  250. if (dictionaryArg == -1) {
  251. if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
  252. dictionaryArg = i;
  253. continue;
  254. }
  255. }
  256. bool ok = true;
  257. #if defined(_MSC_VER)
  258. const char* resolved_path = argv[i];
  259. #else
  260. glob_t resolved_paths;
  261. memset(&resolved_paths, 0, sizeof(resolved_paths));
  262. if (glob(argv[i], GLOB_TILDE, NULL, &resolved_paths) == 0) {
  263. for(size_t j = 0; j < resolved_paths.gl_pathc; ++j) {
  264. const char* resolved_path = resolved_paths.gl_pathv[j];
  265. #endif
  266. std::string content = readFile(resolved_path);
  267. if (chunkLen == 0) {
  268. paths.emplace_back(resolved_path);
  269. data.insert(data.end(), content.begin(), content.end());
  270. total += content.size();
  271. sizes.push_back(content.size());
  272. continue;
  273. } else if (chunkLen <= overlapLen) {
  274. printHelp(fileName(argv[0]));
  275. fprintf(stderr, "Invalid chunkLen - overlapLen combination\n");
  276. exit(1);
  277. }
  278. for (size_t chunkStart = 0;
  279. chunkStart < content.size();
  280. chunkStart += chunkLen - overlapLen) {
  281. std::string chunk = content.substr(chunkStart, chunkLen);
  282. data.insert(data.end(), chunk.begin(), chunk.end());
  283. total += chunk.size();
  284. sizes.push_back(chunk.size());
  285. }
  286. #if !defined(_MSC_VER)
  287. }
  288. } else {
  289. ok = false;
  290. }
  291. globfree(&resolved_paths);
  292. #endif
  293. if (!ok) exit(1);
  294. }
  295. fprintf(stderr, "Number of chunks: %zu; total size: %zu\n", sizes.size(),
  296. total);
  297. bool wantDictionary = (dictionaryArg == -1);
  298. if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
  299. wantDictionary = false;
  300. if (chunkLen != 0) {
  301. printHelp(fileName(argv[0]));
  302. fprintf(stderr, "Cannot mix 'rewrite samples' with positive chunk_len\n");
  303. exit(1);
  304. }
  305. }
  306. if (wantDictionary || total == 0) {
  307. printHelp(fileName(argv[0]));
  308. fprintf(stderr, "Not enough arguments\n");
  309. exit(1);
  310. }
  311. if (method == METHOD_SIEVE) {
  312. writeFile(argv[dictionaryArg], sieve_generate(
  313. targetSize, sliceLen, sizes, data.data()));
  314. } else if (method == METHOD_DM) {
  315. writeFile(argv[dictionaryArg], DM_generate(
  316. targetSize, sizes, data.data()));
  317. } else if (method == METHOD_DURCHSCHLAG) {
  318. writeFile(argv[dictionaryArg], durchschlag_generate(
  319. targetSize, sliceLen, blockSize, sizes, data.data()));
  320. } else if (method == METHOD_DISTILL) {
  321. durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
  322. writeSamples(paths, sizes, data.data());
  323. } else if (method == METHOD_PURIFY) {
  324. durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
  325. writeSamples(paths, sizes, data.data());
  326. } else {
  327. printHelp(fileName(argv[0]));
  328. fprintf(stderr, "Unknown generator\n");
  329. exit(1);
  330. }
  331. return 0;
  332. }