unicodetext.cc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. /**
  2. * Copyright 2010 Google Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "include_gunit.h"
  17. #include "util/utf8/unicodetext.h"
  18. #include <string.h> // for memcpy, NULL, memcmp, etc
  19. #include <algorithm> // for max
  20. //#include "base/logging.h" // for operator<<, CHECK, etc
  21. //#include "base/stringprintf.h" // for StringPrintf, StringAppendF
  22. //#include "strings/stringpiece.h" // for StringPiece, etc
  23. #include "third_party/utf/utf.h" // for isvalidcharntorune, etc
  24. #include "util/utf8/unilib.h" // for IsInterchangeValid, etc
  25. #include "util/utf8/unilib_utf8_utils.h" // for OneCharLen
  // Counts the code points encoded in the UTF-8 byte range [start, end).
  // Every byte that is not a continuation byte (bit pattern 10xxxxxx) begins a
  // new code point, so counting those bytes counts the code points.
  static int CodepointDistance(const char *start, const char *end) {
    int lead_bytes = 0;
    const char *cursor = start;
    while (cursor < end) {
      const unsigned char octet = static_cast<unsigned char>(*cursor);
      if ((octet & 0xC0) != 0x80) {
        ++lead_bytes;
      }
      ++cursor;
    }
    return lead_bytes;
  }
  34. static int CodepointCount(const char *utf8, int len) {
  35. return CodepointDistance(utf8, utf8 + len);
  36. }
  37. UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first,
  38. const UnicodeText::const_iterator &last) {
  39. return CodepointDistance(first.it_, last.it_);
  40. }
  41. // ---------- Utility ----------
  42. static int ConvertToInterchangeValid(char *start, int len) {
  43. // This routine is called only when we've discovered that a UTF-8 buffer
  44. // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
  45. // was not interchange valid. This indicates a bug in the caller, and
  46. // a LOG(WARNING) is done in that case.
  47. // This is similar to CoerceToInterchangeValid, but it replaces each
  48. // structurally valid byte with a space, and each non-interchange
  49. // character with a space, even when that character requires more
  50. // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
  51. // structurally valid UTF8, but U+FDD0 is not an interchange-valid
  52. // code point. The result should contain one space, not three.
  53. //
  54. // Since the conversion never needs to write more data than it
  55. // reads, it is safe to change the buffer in place. It returns the
  56. // number of bytes written.
  57. char *const in = start;
  58. char *out = start;
  59. char *const end = start + len;
  60. while (start < end) {
  61. int good = UniLib::SpanInterchangeValid(start, end - start);
  62. if (good > 0) {
  63. if (out != start) {
  64. memmove(out, start, good);
  65. }
  66. out += good;
  67. start += good;
  68. if (start == end) {
  69. break;
  70. }
  71. }
  72. // Is the current string invalid UTF8 or just non-interchange UTF8?
  73. char32 rune;
  74. int n;
  75. if (isvalidcharntorune(start, end - start, &rune, &n)) {
  76. // structurally valid UTF8, but not interchange valid
  77. start += n; // Skip over the whole character.
  78. } else { // bad UTF8
  79. start += 1; // Skip over just one byte
  80. }
  81. *out++ = ' ';
  82. }
  83. return out - in;
  84. }
  85. // *************** Data representation **********
  86. // Note: the copy constructor is undefined.
  87. // After reserve(), resize(), or clear(), we're an owner, not an alias.
  88. void UnicodeText::Repr::reserve(int new_capacity) {
  89. // If there's already enough capacity, and we're an owner, do nothing.
  90. if (capacity_ >= new_capacity && ours_)
  91. return;
  92. // Otherwise, allocate a new buffer.
  93. capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
  94. char *new_data = new char[capacity_];
  95. // If there is an old buffer, copy it into the new buffer.
  96. if (data_) {
  97. memcpy(new_data, data_, size_);
  98. if (ours_)
  99. delete[] data_; // If we owned the old buffer, free it.
  100. }
  101. data_ = new_data;
  102. ours_ = true; // We own the new buffer.
  103. // size_ is unchanged.
  104. }
  105. void UnicodeText::Repr::resize(int new_size) {
  106. if (new_size == 0) {
  107. clear();
  108. } else {
  109. if (!ours_ || new_size > capacity_)
  110. reserve(new_size);
  111. // Clear the memory in the expanded part.
  112. if (size_ < new_size)
  113. memset(data_ + size_, 0, new_size - size_);
  114. size_ = new_size;
  115. ours_ = true;
  116. }
  117. }
  118. // This implementation of clear() deallocates the buffer if we're an owner.
  119. // That's not strictly necessary; we could just set size_ to 0.
  120. void UnicodeText::Repr::clear() {
  121. if (ours_)
  122. delete[] data_;
  123. data_ = nullptr;
  124. size_ = capacity_ = 0;
  125. ours_ = true;
  126. }
  127. void UnicodeText::Repr::Copy(const char *data, int size) {
  128. resize(size);
  129. memcpy(data_, data, size);
  130. }
  131. void UnicodeText::Repr::TakeOwnershipOf(char *data, int size, int capacity) {
  132. if (data == data_)
  133. return; // We already own this memory. (Weird case.)
  134. if (ours_ && data_)
  135. delete[] data_; // If we owned the old buffer, free it.
  136. data_ = data;
  137. size_ = size;
  138. capacity_ = capacity;
  139. ours_ = true;
  140. }
  141. void UnicodeText::Repr::PointTo(const char *data, int size) {
  142. if (ours_ && data_)
  143. delete[] data_; // If we owned the old buffer, free it.
  144. data_ = const_cast<char *>(data);
  145. size_ = size;
  146. capacity_ = size;
  147. ours_ = false;
  148. }
  149. void UnicodeText::Repr::append(const char *bytes, int byte_length) {
  150. reserve(size_ + byte_length);
  151. memcpy(data_ + size_, bytes, byte_length);
  152. size_ += byte_length;
  153. }
  154. #ifdef INCLUDE_TENSORFLOW
  155. string UnicodeText::Repr::DebugString() const {
  156. return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_,
  157. capacity_, ours_ ? "Owned" : "Alias");
  158. }
  159. #endif
  160. // *************** UnicodeText ******************
  161. // ----- Constructors -----
  162. // Default constructor
  163. UnicodeText::UnicodeText() {}
  164. // Copy constructor
  165. UnicodeText::UnicodeText(const UnicodeText &src) {
  166. Copy(src);
  167. }
  168. // Substring constructor
  169. UnicodeText::UnicodeText(const UnicodeText::const_iterator &first,
  170. const UnicodeText::const_iterator &last) {
  171. CHECK(first <= last) << " Incompatible iterators";
  172. repr_.append(first.it_, last.it_ - first.it_);
  173. }
  174. string UnicodeText::UTF8Substring(const const_iterator &first, const const_iterator &last) {
  175. CHECK(first <= last) << " Incompatible iterators";
  176. return string(first.it_, last.it_ - first.it_);
  177. }
  178. // ----- Copy -----
  179. UnicodeText &UnicodeText::operator=(const UnicodeText &src) {
  180. if (this != &src) {
  181. Copy(src);
  182. }
  183. return *this;
  184. }
  185. UnicodeText &UnicodeText::Copy(const UnicodeText &src) {
  186. repr_.Copy(src.repr_.data_, src.repr_.size_);
  187. return *this;
  188. }
  189. UnicodeText &UnicodeText::CopyUTF8(const char *buffer, int byte_length) {
  190. repr_.Copy(buffer, byte_length);
  191. if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
  192. LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  193. repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  194. }
  195. return *this;
  196. }
  197. UnicodeText &UnicodeText::UnsafeCopyUTF8(const char *buffer, int byte_length) {
  198. repr_.Copy(buffer, byte_length);
  199. return *this;
  200. }
  201. // ----- TakeOwnershipOf -----
  202. UnicodeText &UnicodeText::TakeOwnershipOfUTF8(char *buffer, int byte_length, int byte_capacity) {
  203. repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  204. if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
  205. LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  206. repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  207. }
  208. return *this;
  209. }
  210. UnicodeText &UnicodeText::UnsafeTakeOwnershipOfUTF8(char *buffer, int byte_length,
  211. int byte_capacity) {
  212. repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  213. return *this;
  214. }
  215. // ----- PointTo -----
  216. UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
  217. if (UniLib::IsInterchangeValid(buffer, byte_length)) {
  218. repr_.PointTo(buffer, byte_length);
  219. } else {
  220. LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  221. repr_.Copy(buffer, byte_length);
  222. repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  223. }
  224. return *this;
  225. }
  226. UnicodeText &UnicodeText::UnsafePointToUTF8(const char *buffer, int byte_length) {
  227. repr_.PointTo(buffer, byte_length);
  228. return *this;
  229. }
  230. UnicodeText &UnicodeText::PointTo(const UnicodeText &src) {
  231. repr_.PointTo(src.repr_.data_, src.repr_.size_);
  232. return *this;
  233. }
  234. UnicodeText &UnicodeText::PointTo(const const_iterator &first, const const_iterator &last) {
  235. CHECK(first <= last) << " Incompatible iterators";
  236. repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
  237. return *this;
  238. }
  239. // ----- Append -----
  240. UnicodeText &UnicodeText::append(const UnicodeText &u) {
  241. repr_.append(u.repr_.data_, u.repr_.size_);
  242. return *this;
  243. }
  244. UnicodeText &UnicodeText::append(const const_iterator &first, const const_iterator &last) {
  245. CHECK(first <= last) << " Incompatible iterators";
  246. repr_.append(first.it_, last.it_ - first.it_);
  247. return *this;
  248. }
  249. UnicodeText &UnicodeText::UnsafeAppendUTF8(const char *utf8, int len) {
  250. repr_.append(utf8, len);
  251. return *this;
  252. }
  253. // ----- substring searching -----
  254. UnicodeText::const_iterator UnicodeText::find(const UnicodeText &look,
  255. const_iterator start_pos) const {
  256. CHECK_GE(start_pos.utf8_data(), utf8_data());
  257. CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
  258. return UnsafeFind(look, start_pos);
  259. }
  260. UnicodeText::const_iterator UnicodeText::find(const UnicodeText &look) const {
  261. return UnsafeFind(look, begin());
  262. }
  263. UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
  264. const_iterator start_pos) const {
  265. // Due to the magic of the UTF8 encoding, searching for a sequence of
  266. // letters is equivalent to substring search.
  267. #ifdef INCLUDE_TENSORFLOW
  268. StringPiece searching(utf8_data(), utf8_length());
  269. StringPiece look_piece(look.utf8_data(), look.utf8_length());
  270. #endif
  271. LOG(FATAL) << "Not implemented";
  272. #ifdef INCLUDE_TENSORFLOW
  273. // StringPiece::size_type found =
  274. // searching.find(look_piece, start_pos.utf8_data() - utf8_data());
  275. StringPiece::size_type found = StringPiece::npos;
  276. if (found == StringPiece::npos)
  277. return end();
  278. return const_iterator(utf8_data() + found);
  279. #else
  280. return end();
  281. #endif
  282. }
  283. #ifdef INCLUDE_TENSORFLOW
  284. bool UnicodeText::HasReplacementChar() const {
  285. // Equivalent to:
  286. // UnicodeText replacement_char;
  287. // replacement_char.push_back(0xFFFD);
  288. // return find(replacement_char) != end();
  289. StringPiece searching(utf8_data(), utf8_length());
  290. StringPiece looking_for("\xEF\xBF\xBD", 3);
  291. LOG(FATAL) << "Not implemented";
  292. // return searching.find(looking_for) != StringPiece::npos;
  293. return false;
  294. }
  295. #endif
  296. // ----- other methods -----
  297. // Clear operator
  298. void UnicodeText::clear() {
  299. repr_.clear();
  300. }
  301. // Destructor
  302. UnicodeText::~UnicodeText() {}
  303. void UnicodeText::push_back(char32 c) {
  304. if (UniLib::IsValidCodepoint(c)) {
  305. char buf[UTFmax];
  306. int len = runetochar(buf, &c);
  307. if (UniLib::IsInterchangeValid(buf, len)) {
  308. repr_.append(buf, len);
  309. } else {
  310. LOG(WARNING) << "Unicode value 0x" << std::hex << c << " is not valid for interchange";
  311. repr_.append(" ", 1);
  312. }
  313. } else {
  314. LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
  315. repr_.append(" ", 1);
  316. }
  317. }
  318. int UnicodeText::size() const {
  319. return CodepointCount(repr_.data_, repr_.size_);
  320. }
  321. bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
  322. if (&lhs == &rhs)
  323. return true;
  324. if (lhs.repr_.size_ != rhs.repr_.size_)
  325. return false;
  326. return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
  327. }
  328. #ifdef INCLUDE_TENSORFLOW
  329. string UnicodeText::DebugString() const {
  330. return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(),
  331. repr_.DebugString().c_str());
  332. }
  333. #endif
  334. // ******************* UnicodeText::const_iterator *********************
  335. // The implementation of const_iterator would be nicer if it
  336. // inherited from boost::iterator_facade
  337. // (http://boost.org/libs/iterator/doc/iterator_facade.html).
  338. UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
  339. UnicodeText::const_iterator::const_iterator(const const_iterator &other) : it_(other.it_) {}
  340. UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(const const_iterator &other) {
  341. if (&other != this)
  342. it_ = other.it_;
  343. return *this;
  344. }
  345. UnicodeText::const_iterator UnicodeText::begin() const {
  346. return const_iterator(repr_.data_);
  347. }
  348. UnicodeText::const_iterator UnicodeText::end() const {
  349. return const_iterator(repr_.data_ + repr_.size_);
  350. }
  351. bool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs) {
  352. return lhs.it_ < rhs.it_;
  353. }
  354. char32 UnicodeText::const_iterator::operator*() const {
  355. // (We could call chartorune here, but that does some
  356. // error-checking, and we're guaranteed that our data is valid
  357. // UTF-8. Also, we expect this routine to be called very often. So
  358. // for speed, we do the calculation ourselves.)
  359. // Convert from UTF-8
  360. unsigned char byte1 = it_[0];
  361. if (byte1 < 0x80)
  362. return byte1;
  363. unsigned char byte2 = it_[1];
  364. if (byte1 < 0xE0)
  365. return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
  366. unsigned char byte3 = it_[2];
  367. if (byte1 < 0xF0)
  368. return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
  369. unsigned char byte4 = it_[3];
  370. return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
  371. }
  372. UnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {
  373. it_ += UniLib::OneCharLen(it_);
  374. return *this;
  375. }
  376. UnicodeText::const_iterator &UnicodeText::const_iterator::operator--() {
  377. while (UniLib::IsTrailByte(*--it_))
  378. ;
  379. return *this;
  380. }
  381. int UnicodeText::const_iterator::get_utf8(char *utf8_output) const {
  382. utf8_output[0] = it_[0];
  383. if ((it_[0] & 0xff) < 0x80)
  384. return 1;
  385. utf8_output[1] = it_[1];
  386. if ((it_[0] & 0xff) < 0xE0)
  387. return 2;
  388. utf8_output[2] = it_[2];
  389. if ((it_[0] & 0xff) < 0xF0)
  390. return 3;
  391. utf8_output[3] = it_[3];
  392. return 4;
  393. }
  394. string UnicodeText::const_iterator::get_utf8_string() const {
  395. return string(utf8_data(), utf8_length());
  396. }
  397. int UnicodeText::const_iterator::utf8_length() const {
  398. if ((it_[0] & 0xff) < 0x80) {
  399. return 1;
  400. } else if ((it_[0] & 0xff) < 0xE0) {
  401. return 2;
  402. } else if ((it_[0] & 0xff) < 0xF0) {
  403. return 3;
  404. } else {
  405. return 4;
  406. }
  407. }
  408. UnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const {
  409. CHECK(p != nullptr);
  410. const char *start = utf8_data();
  411. int len = utf8_length();
  412. const char *end = start + len;
  413. CHECK(p >= start);
  414. CHECK(p <= end);
  415. CHECK(p == end || !UniLib::IsTrailByte(*p));
  416. return const_iterator(p);
  417. }
  418. #ifdef INCLUDE_TENSORFLOW
  419. string UnicodeText::const_iterator::DebugString() const {
  420. return tensorflow::strings::Printf("{iter %p}", it_);
  421. }
  422. // *************************** Utilities *************************
  423. string CodepointString(const UnicodeText &t) {
  424. string s;
  425. UnicodeText::const_iterator it = t.begin(), end = t.end();
  426. while (it != end)
  427. tensorflow::strings::Appendf(&s, "%X ", *it++);
  428. return s;
  429. }
  430. #endif