unicodetext.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490
  1. /**
  2. * Copyright 2010 Google Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_
  17. #define UTIL_UTF8_PUBLIC_UNICODETEXT_H_
  18. #include <stddef.h> // for NULL, ptrdiff_t
  19. #include <iterator> // for bidirectional_iterator_tag, etc
  20. #include <string> // for string
  21. #include <utility> // for pair
  22. #include "syntaxnet/base.h"
  23. // ***************************** UnicodeText **************************
  24. //
  // A UnicodeText object is a container for a sequence of Unicode
  // codepoint values. It has a default constructor, a copy constructor,
  // and an assignment operator.
  27. // Data can be appended to it from another UnicodeText, from
  28. // iterators, or from a single codepoint.
  29. //
  30. // The internal representation of the text is UTF-8. Since UTF-8 is a
  31. // variable-width format, UnicodeText does not provide random access
  32. // to the text, and changes to the text are permitted only at the end.
  33. //
  34. // The UnicodeText class defines a const_iterator. The dereferencing
  35. // operator (*) returns a codepoint (char32). The iterator is a
  36. // bidirectional, read-only iterator. It becomes invalid if the text
  37. // is changed.
  38. //
  39. // There are methods for appending and retrieving UTF-8 data directly.
  40. // The 'utf8_data' method returns a const char* that contains the
  41. // UTF-8-encoded version of the text; 'utf8_length' returns the number
  // of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up
  // to 4 bytes of UTF-8 data in a char array and returns the number of
  // bytes that it stored.
  45. //
  46. // Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
  47. // 0x10FFFF], but UnicodeText has the additional restriction that it
  48. // can contain only those characters that are valid for interchange on
  49. // the Web. This excludes all of the control codes except for carriage
  50. // return, line feed, and horizontal tab. It also excludes
  51. // non-characters, but codepoints that are in the Private Use regions
  52. // are allowed, as are codepoints that are unassigned. (See the
  53. // Unicode reference for details.) The function UniLib::IsInterchangeValid
  54. // can be used as a test for this property.
  55. //
  56. // UnicodeTexts are safe. Every method that constructs or modifies a
  57. // UnicodeText tests for interchange-validity, and will substitute a
  58. // space for the invalid data. Such cases are reported via
  59. // LOG(WARNING).
  60. //
  61. // MEMORY MANAGEMENT: copy, take ownership, or point to
  62. //
  63. // A UnicodeText is either an "owner", meaning that it owns the memory
  64. // for the data buffer and will free it when the UnicodeText is
  65. // destroyed, or it is an "alias", meaning that it does not.
  66. //
  67. // There are three methods for storing UTF-8 data in a UnicodeText:
  68. //
  69. // CopyUTF8(buffer, len) copies buffer.
  70. //
  71. // TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
  72. //
  73. // PointToUTF8(buffer, size) creates an alias pointing to buffer.
  74. //
  75. // All three methods perform a validity check on the buffer. There are
  76. // private, "unsafe" versions of these functions that bypass the
  77. // validity check. They are used internally and by friend-functions
  78. // that are handling UTF-8 data that has already been validated.
  79. //
  80. // The purpose of an alias is to avoid making an unnecessary copy of a
  81. // UTF-8 buffer while still providing access to the Unicode values
  82. // within that text through iterators or the fast scanners that are
  83. // based on UTF-8 state tables. The lifetime of an alias must not
  84. // exceed the lifetime of the buffer from which it was constructed.
  85. //
  86. // The semantics of an alias might be described as "copy on write or
  87. // repair." The source data is never modified. If push_back() or
  88. // append() is called on an alias, a copy of the data will be created,
  89. // and the UnicodeText will become an owner. If clear() is called on
  90. // an alias, it becomes an (empty) owner.
  91. //
  92. // The copy constructor and the assignment operator produce an owner.
  93. // That is, after direct initialization ("UnicodeText x(y);") or copy
  94. // initialization ("UnicodeText x = y;") x will be an owner, even if y
  95. // was an alias. The assignment operator ("x = y;") also produces an
  96. // owner unless x and y are the same object and y is an alias.
  97. //
  98. // Aliases should be used with care. If the source from which an alias
  99. // was created is freed, or if the contents are changed, while the
  100. // alias is still in use, fatal errors could result. But it can be
  101. // quite useful to have a UnicodeText "window" through which to see a
  102. // UTF-8 buffer without having to pay the price of making a copy.
  103. //
  104. // UTILITIES
  105. //
  106. // The interfaces in util/utf8/public/textutils.h provide higher-level
  107. // utilities for dealing with UnicodeTexts, including routines for
  108. // creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
  109. // strings, creating strings from UnicodeTexts, normalizing text for
  110. // efficient matching or display, and others.
  111. class UnicodeText {
  112. public:
  113. class const_iterator;
  114. typedef char32 value_type;
  115. // Constructors. These always produce owners.
  116. UnicodeText(); // Create an empty text.
  117. UnicodeText(const UnicodeText &src); // copy constructor
  118. // Construct a substring (copies the data).
  119. UnicodeText(const const_iterator &first, const const_iterator &last);
  120. // Assignment operator. This copies the data and produces an owner
  121. // unless this == &src, e.g., "x = x;", which is a no-op.
  122. UnicodeText &operator=(const UnicodeText &src);
  123. // x.Copy(y) copies the data from y into x.
  124. UnicodeText &Copy(const UnicodeText &src);
  125. inline UnicodeText &assign(const UnicodeText &src) {
  126. return Copy(src);
  127. }
  128. // x.PointTo(y) changes x so that it points to y's data.
  129. // It does not copy y or take ownership of y's data.
  130. UnicodeText &PointTo(const UnicodeText &src);
  131. UnicodeText &PointTo(const const_iterator &first, const const_iterator &last);
  132. ~UnicodeText();
  133. void clear(); // Clear text.
  134. bool empty() const {
  135. return repr_.size_ == 0;
  136. } // Test if text is empty.
  137. // Add a codepoint to the end of the text.
  138. // If the codepoint is not interchange-valid, add a space instead
  139. // and log a warning.
  140. void push_back(char32 codepoint);
  141. // Generic appending operation.
  142. // iterator_traits<ForwardIterator>::value_type must be implicitly
  143. // convertible to char32. Typical uses of this method might include:
  144. // char32 chars[] = {0x1, 0x2, ...};
  145. // vector<char32> more_chars = ...;
  146. // utext.append(chars, chars+arraysize(chars));
  147. // utext.append(more_chars.begin(), more_chars.end());
  148. template <typename ForwardIterator>
  149. UnicodeText &append(ForwardIterator first, const ForwardIterator last) {
  150. while (first != last) {
  151. push_back(*first++);
  152. }
  153. return *this;
  154. }
  155. // A specialization of the generic append() method.
  156. UnicodeText &append(const const_iterator &first, const const_iterator &last);
  157. // An optimization of append(source.begin(), source.end()).
  158. UnicodeText &append(const UnicodeText &source);
  159. int size() const; // the number of Unicode characters (codepoints)
  160. friend bool operator==(const UnicodeText &lhs, const UnicodeText &rhs);
  161. friend bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs);
  162. class const_iterator {
  163. typedef const_iterator CI;
  164. public:
  165. typedef std::bidirectional_iterator_tag iterator_category;
  166. typedef char32 value_type;
  167. typedef ptrdiff_t difference_type;
  168. typedef void pointer; // (Not needed.)
  169. typedef const char32 reference; // (Needed for const_reverse_iterator)
  170. // Iterators are default-constructible.
  171. const_iterator();
  172. // It's safe to make multiple passes over a UnicodeText.
  173. const_iterator(const const_iterator &other);
  174. const_iterator &operator=(const const_iterator &other);
  175. char32 operator*() const; // Dereference
  176. const_iterator &operator++(); // Advance (++iter)
  177. const_iterator operator++(int) { // (iter++)
  178. const_iterator result(*this);
  179. ++*this;
  180. return result;
  181. }
  182. const_iterator &operator--(); // Retreat (--iter)
  183. const_iterator operator--(int) { // (iter--)
  184. const_iterator result(*this);
  185. --*this;
  186. return result;
  187. }
  188. // We love relational operators.
  189. friend bool operator==(const CI &lhs, const CI &rhs) {
  190. return lhs.it_ == rhs.it_;
  191. }
  192. friend bool operator!=(const CI &lhs, const CI &rhs) {
  193. return !(lhs == rhs);
  194. }
  195. friend bool operator<(const CI &lhs, const CI &rhs);
  196. friend bool operator>(const CI &lhs, const CI &rhs) {
  197. return rhs < lhs;
  198. }
  199. friend bool operator<=(const CI &lhs, const CI &rhs) {
  200. return !(rhs < lhs);
  201. }
  202. friend bool operator>=(const CI &lhs, const CI &rhs) {
  203. return !(lhs < rhs);
  204. }
  205. friend difference_type distance(const CI &first, const CI &last);
  206. // UTF-8-specific methods
  207. // Store the UTF-8 encoding of the current codepoint into buf,
  208. // which must be at least 4 bytes long. Return the number of
  209. // bytes written.
  210. int get_utf8(char *buf) const;
  211. // Return the UTF-8 character that the iterator points to.
  212. string get_utf8_string() const;
  213. // Return the byte length of the UTF-8 character the iterator points to.
  214. int utf8_length() const;
  215. // Return the iterator's pointer into the UTF-8 data.
  216. const char *utf8_data() const {
  217. return it_;
  218. }
  219. string DebugString() const;
  220. private:
  221. friend class UnicodeText;
  222. friend class UnicodeTextUtils;
  223. friend class UTF8StateTableProperty;
  224. explicit const_iterator(const char *it) : it_(it) {}
  225. const char *it_;
  226. };
  227. const_iterator begin() const;
  228. const_iterator end() const;
  229. class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
  230. public:
  231. explicit const_reverse_iterator(const_iterator it)
  232. : std::reverse_iterator<const_iterator>(it) {}
  233. const char *utf8_data() const {
  234. const_iterator tmp_it = base();
  235. return (--tmp_it).utf8_data();
  236. }
  237. int get_utf8(char *buf) const {
  238. const_iterator tmp_it = base();
  239. return (--tmp_it).get_utf8(buf);
  240. }
  241. string get_utf8_string() const {
  242. const_iterator tmp_it = base();
  243. return (--tmp_it).get_utf8_string();
  244. }
  245. int utf8_length() const {
  246. const_iterator tmp_it = base();
  247. return (--tmp_it).utf8_length();
  248. }
  249. };
  250. const_reverse_iterator rbegin() const {
  251. return const_reverse_iterator(end());
  252. }
  253. const_reverse_iterator rend() const {
  254. return const_reverse_iterator(begin());
  255. }
  256. // Substring searching. Returns the beginning of the first
  257. // occurrence of "look", or end() if not found.
  258. const_iterator find(const UnicodeText &look, const_iterator start_pos) const;
  259. // Equivalent to find(look, begin())
  260. const_iterator find(const UnicodeText &look) const;
  261. // Returns whether this contains the character U+FFFD. This can
  262. // occur, for example, if the input to Encodings::Decode() had byte
  263. // sequences that were invalid in the source encoding.
  264. bool HasReplacementChar() const;
  265. // UTF-8-specific methods
  266. //
  267. // Return the data, length, and capacity of UTF-8-encoded version of
  268. // the text. Length and capacity are measured in bytes.
  269. const char *utf8_data() const {
  270. return repr_.data_;
  271. }
  272. int utf8_length() const {
  273. return repr_.size_;
  274. }
  275. int utf8_capacity() const {
  276. return repr_.capacity_;
  277. }
  278. // Return the UTF-8 data as a string.
  279. static string UTF8Substring(const const_iterator &first, const const_iterator &last);
  280. // There are three methods for initializing a UnicodeText from UTF-8
  281. // data. They vary in details of memory management. In all cases,
  282. // the data is tested for interchange-validity. If it is not
  283. // interchange-valid, a LOG(WARNING) is issued, and each
  284. // structurally invalid byte and each interchange-invalid codepoint
  285. // is replaced with a space.
  286. // x.CopyUTF8(buf, len) copies buf into x.
  287. UnicodeText &CopyUTF8(const char *utf8_buffer, int byte_length);
  288. // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
  289. // buf. buf is not copied.
  290. UnicodeText &TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity);
  291. // x.PointToUTF8(buf,len) changes x so that it points to buf
  292. // ("becomes an alias"). It does not take ownership or copy buf.
  293. // If the buffer is not valid, this has the same effect as
  294. // CopyUTF8(utf8_buffer, byte_length).
  295. UnicodeText &PointToUTF8(const char *utf8_buffer, int byte_length);
  296. // Occasionally it is necessary to use functions that operate on the
  297. // pointer returned by utf8_data(). MakeIterator(p) provides a way
  298. // to get back to the UnicodeText level. It uses CHECK to ensure
  299. // that p is a pointer within this object's UTF-8 data, and that it
  300. // points to the beginning of a character.
  301. const_iterator MakeIterator(const char *p) const;
  302. string DebugString() const;
  303. private:
  304. friend class const_iterator;
  305. friend class UnicodeTextUtils;
  306. class Repr { // A byte-string.
  307. public:
  308. char *data_;
  309. int size_;
  310. int capacity_;
  311. bool ours_; // Do we own data_?
  312. Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
  313. ~Repr() {
  314. if (ours_)
  315. delete[] data_;
  316. }
  317. void clear();
  318. void reserve(int capacity);
  319. void resize(int size);
  320. void append(const char *bytes, int byte_length);
  321. void Copy(const char *data, int size);
  322. void TakeOwnershipOf(char *data, int size, int capacity);
  323. void PointTo(const char *data, int size);
  324. string DebugString() const;
  325. private:
  326. Repr &operator=(const Repr &);
  327. Repr(const Repr &other);
  328. };
  329. Repr repr_;
  330. // UTF-8-specific private methods.
  331. // These routines do not perform a validity check when compiled
  332. // in opt mode.
  333. // It is an error to call these methods with UTF-8 data that
  334. // is not interchange-valid.
  335. //
  336. UnicodeText &UnsafeCopyUTF8(const char *utf8_buffer, int byte_length);
  337. UnicodeText &UnsafeTakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity);
  338. UnicodeText &UnsafePointToUTF8(const char *utf8_buffer, int byte_length);
  339. UnicodeText &UnsafeAppendUTF8(const char *utf8_buffer, int byte_length);
  340. const_iterator UnsafeFind(const UnicodeText &look, const_iterator start_pos) const;
  341. };
  342. bool operator==(const UnicodeText &lhs, const UnicodeText &rhs);
  343. inline bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs) {
  344. return !(lhs == rhs);
  345. }
  346. // UnicodeTextRange is a pair of iterators, useful for specifying text
  347. // segments. If the iterators are ==, the segment is empty.
  348. typedef pair<UnicodeText::const_iterator, UnicodeText::const_iterator> UnicodeTextRange;
  349. inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange &r) {
  350. return r.first == r.second;
  351. }
  352. // *************************** Utilities *************************
  353. // A factory function for creating a UnicodeText from a buffer of
  354. // UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
  355. // is an "owner.")
  356. //
  357. // Each byte that is structurally invalid will be replaced with a
  358. // space. Each codepoint that is interchange-invalid will also be
  359. // replaced with a space, even if the codepoint was represented with a
  360. // multibyte sequence in the UTF-8 data.
  361. //
  362. inline UnicodeText MakeUnicodeTextAcceptingOwnership(char *utf8_buffer, int byte_length,
  363. int byte_capacity) {
  364. return UnicodeText().TakeOwnershipOfUTF8(utf8_buffer, byte_length, byte_capacity);
  365. }
  366. // A factory function for creating a UnicodeText from a buffer of
  367. // UTF-8 data. The new UnicodeText does not take ownership of the
  368. // buffer. (It is an "alias.")
  369. //
  370. inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(const char *utf8_buffer,
  371. int byte_length) {
  372. return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
  373. }
  374. // Create a UnicodeText from a UTF-8 string or buffer.
  375. //
  376. // If do_copy is true, then a copy of the string is made. The copy is
  377. // owned by the resulting UnicodeText object and will be freed when
  378. // the object is destroyed. This UnicodeText object is referred to
  379. // as an "owner."
  380. //
  381. // If do_copy is false, then no copy is made. The resulting
  382. // UnicodeText object does NOT take ownership of the string; in this
  383. // case, the lifetime of the UnicodeText object must not exceed the
  384. // lifetime of the string. This Unicodetext object is referred to as
  385. // an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
  386. //
  387. // If the input string does not contain valid UTF-8, then a copy is
  388. // made (as if do_copy were true) and coerced to valid UTF-8 by
  389. // replacing each invalid byte with a space.
  390. //
  391. inline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len, bool do_copy) {
  392. UnicodeText t;
  393. if (do_copy) {
  394. t.CopyUTF8(utf8_buf, len);
  395. } else {
  396. t.PointToUTF8(utf8_buf, len);
  397. }
  398. return t;
  399. }
  400. inline UnicodeText UTF8ToUnicodeText(const string &utf_string, bool do_copy) {
  401. return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy);
  402. }
  403. inline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len) {
  404. return UTF8ToUnicodeText(utf8_buf, len, true);
  405. }
  406. inline UnicodeText UTF8ToUnicodeText(const string &utf8_string) {
  407. return UTF8ToUnicodeText(utf8_string, true);
  408. }
  409. // Return a string containing the UTF-8 encoded version of all the
  410. // Unicode characters in t.
  411. inline string UnicodeTextToUTF8(const UnicodeText &t) {
  412. return string(t.utf8_data(), t.utf8_length());
  413. }
  // ArraySizeHelper is declared only so that arraysize can inspect its
  // return type: given a reference to an array of Count elements it
  // "returns" a reference to a char array of the same length. No
  // definition is needed because the call appears only inside sizeof.
  template <typename Elem, size_t Count>
  char (&ArraySizeHelper(Elem (&arr)[Count]))[Count];

  // Evaluates to the number of elements in a built-in array.
  #define arraysize(arr) (sizeof(ArraySizeHelper(arr)))
  420. // For debugging. Return a string of integers, written in uppercase
  421. // hex (%X), corresponding to the codepoints within the text. Each
  422. // integer is followed by a space. E.g., "61 62 6A 3005 ".
  423. string CodepointString(const UnicodeText &t);
  424. #endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_