| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509 |
- /**
- * Copyright 2010 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include "include_gunit.h"
- #include "util/utf8/unicodetext.h"
- #include <string.h> // for memcpy, NULL, memcmp, etc
- #include <algorithm> // for max
- //#include "base/logging.h" // for operator<<, CHECK, etc
- //#include "base/stringprintf.h" // for StringPrintf, StringAppendF
- //#include "strings/stringpiece.h" // for StringPiece, etc
- #include "third_party/utf/utf.h" // for isvalidcharntorune, etc
- #include "util/utf8/unilib.h" // for IsInterchangeValid, etc
- #include "util/utf8/unilib_utf8_utils.h" // for OneCharLen
- static int CodepointDistance(const char *start, const char *end) {
- int n = 0;
- // Increment n on every non-trail-byte.
- for (const char *p = start; p < end; ++p) {
- n += (*reinterpret_cast<const signed char *>(p) >= -0x40);
- }
- return n;
- }
- static int CodepointCount(const char *utf8, int len) {
- return CodepointDistance(utf8, utf8 + len);
- }
- UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first,
- const UnicodeText::const_iterator &last) {
- return CodepointDistance(first.it_, last.it_);
- }
- // ---------- Utility ----------
- static int ConvertToInterchangeValid(char *start, int len) {
- // This routine is called only when we've discovered that a UTF-8 buffer
- // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
- // was not interchange valid. This indicates a bug in the caller, and
- // a LOG(WARNING) is done in that case.
- // This is similar to CoerceToInterchangeValid, but it replaces each
- // structurally valid byte with a space, and each non-interchange
- // character with a space, even when that character requires more
- // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
- // structurally valid UTF8, but U+FDD0 is not an interchange-valid
- // code point. The result should contain one space, not three.
- //
- // Since the conversion never needs to write more data than it
- // reads, it is safe to change the buffer in place. It returns the
- // number of bytes written.
- char *const in = start;
- char *out = start;
- char *const end = start + len;
- while (start < end) {
- int good = UniLib::SpanInterchangeValid(start, end - start);
- if (good > 0) {
- if (out != start) {
- memmove(out, start, good);
- }
- out += good;
- start += good;
- if (start == end) {
- break;
- }
- }
- // Is the current string invalid UTF8 or just non-interchange UTF8?
- char32 rune;
- int n;
- if (isvalidcharntorune(start, end - start, &rune, &n)) {
- // structurally valid UTF8, but not interchange valid
- start += n; // Skip over the whole character.
- } else { // bad UTF8
- start += 1; // Skip over just one byte
- }
- *out++ = ' ';
- }
- return out - in;
- }
- // *************** Data representation **********
- // Note: the copy constructor is undefined.
- // After reserve(), resize(), or clear(), we're an owner, not an alias.
- void UnicodeText::Repr::reserve(int new_capacity) {
- // If there's already enough capacity, and we're an owner, do nothing.
- if (capacity_ >= new_capacity && ours_)
- return;
- // Otherwise, allocate a new buffer.
- capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
- char *new_data = new char[capacity_];
- // If there is an old buffer, copy it into the new buffer.
- if (data_) {
- memcpy(new_data, data_, size_);
- if (ours_)
- delete[] data_; // If we owned the old buffer, free it.
- }
- data_ = new_data;
- ours_ = true; // We own the new buffer.
- // size_ is unchanged.
- }
- void UnicodeText::Repr::resize(int new_size) {
- if (new_size == 0) {
- clear();
- } else {
- if (!ours_ || new_size > capacity_)
- reserve(new_size);
- // Clear the memory in the expanded part.
- if (size_ < new_size)
- memset(data_ + size_, 0, new_size - size_);
- size_ = new_size;
- ours_ = true;
- }
- }
- // This implementation of clear() deallocates the buffer if we're an owner.
- // That's not strictly necessary; we could just set size_ to 0.
- void UnicodeText::Repr::clear() {
- if (ours_)
- delete[] data_;
- data_ = nullptr;
- size_ = capacity_ = 0;
- ours_ = true;
- }
- void UnicodeText::Repr::Copy(const char *data, int size) {
- resize(size);
- memcpy(data_, data, size);
- }
- void UnicodeText::Repr::TakeOwnershipOf(char *data, int size, int capacity) {
- if (data == data_)
- return; // We already own this memory. (Weird case.)
- if (ours_ && data_)
- delete[] data_; // If we owned the old buffer, free it.
- data_ = data;
- size_ = size;
- capacity_ = capacity;
- ours_ = true;
- }
- void UnicodeText::Repr::PointTo(const char *data, int size) {
- if (ours_ && data_)
- delete[] data_; // If we owned the old buffer, free it.
- data_ = const_cast<char *>(data);
- size_ = size;
- capacity_ = size;
- ours_ = false;
- }
- void UnicodeText::Repr::append(const char *bytes, int byte_length) {
- reserve(size_ + byte_length);
- memcpy(data_ + size_, bytes, byte_length);
- size_ += byte_length;
- }
- #ifdef INCLUDE_TENSORFLOW
- string UnicodeText::Repr::DebugString() const {
- return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_,
- capacity_, ours_ ? "Owned" : "Alias");
- }
- #endif
- // *************** UnicodeText ******************
- // ----- Constructors -----
- // Default constructor
- UnicodeText::UnicodeText() {}
- // Copy constructor
- UnicodeText::UnicodeText(const UnicodeText &src) {
- Copy(src);
- }
- // Substring constructor
- UnicodeText::UnicodeText(const UnicodeText::const_iterator &first,
- const UnicodeText::const_iterator &last) {
- CHECK(first <= last) << " Incompatible iterators";
- repr_.append(first.it_, last.it_ - first.it_);
- }
- string UnicodeText::UTF8Substring(const const_iterator &first, const const_iterator &last) {
- CHECK(first <= last) << " Incompatible iterators";
- return string(first.it_, last.it_ - first.it_);
- }
- // ----- Copy -----
- UnicodeText &UnicodeText::operator=(const UnicodeText &src) {
- if (this != &src) {
- Copy(src);
- }
- return *this;
- }
- UnicodeText &UnicodeText::Copy(const UnicodeText &src) {
- repr_.Copy(src.repr_.data_, src.repr_.size_);
- return *this;
- }
- UnicodeText &UnicodeText::CopyUTF8(const char *buffer, int byte_length) {
- repr_.Copy(buffer, byte_length);
- if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
- LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
- repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
- }
- return *this;
- }
- UnicodeText &UnicodeText::UnsafeCopyUTF8(const char *buffer, int byte_length) {
- repr_.Copy(buffer, byte_length);
- return *this;
- }
- // ----- TakeOwnershipOf -----
- UnicodeText &UnicodeText::TakeOwnershipOfUTF8(char *buffer, int byte_length, int byte_capacity) {
- repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
- if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
- LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
- repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
- }
- return *this;
- }
- UnicodeText &UnicodeText::UnsafeTakeOwnershipOfUTF8(char *buffer, int byte_length,
- int byte_capacity) {
- repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
- return *this;
- }
- // ----- PointTo -----
- UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
- if (UniLib::IsInterchangeValid(buffer, byte_length)) {
- repr_.PointTo(buffer, byte_length);
- } else {
- LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
- repr_.Copy(buffer, byte_length);
- repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
- }
- return *this;
- }
- UnicodeText &UnicodeText::UnsafePointToUTF8(const char *buffer, int byte_length) {
- repr_.PointTo(buffer, byte_length);
- return *this;
- }
- UnicodeText &UnicodeText::PointTo(const UnicodeText &src) {
- repr_.PointTo(src.repr_.data_, src.repr_.size_);
- return *this;
- }
- UnicodeText &UnicodeText::PointTo(const const_iterator &first, const const_iterator &last) {
- CHECK(first <= last) << " Incompatible iterators";
- repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
- return *this;
- }
- // ----- Append -----
- UnicodeText &UnicodeText::append(const UnicodeText &u) {
- repr_.append(u.repr_.data_, u.repr_.size_);
- return *this;
- }
- UnicodeText &UnicodeText::append(const const_iterator &first, const const_iterator &last) {
- CHECK(first <= last) << " Incompatible iterators";
- repr_.append(first.it_, last.it_ - first.it_);
- return *this;
- }
- UnicodeText &UnicodeText::UnsafeAppendUTF8(const char *utf8, int len) {
- repr_.append(utf8, len);
- return *this;
- }
- // ----- substring searching -----
- UnicodeText::const_iterator UnicodeText::find(const UnicodeText &look,
- const_iterator start_pos) const {
- CHECK_GE(start_pos.utf8_data(), utf8_data());
- CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
- return UnsafeFind(look, start_pos);
- }
- UnicodeText::const_iterator UnicodeText::find(const UnicodeText &look) const {
- return UnsafeFind(look, begin());
- }
- UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
- const_iterator start_pos) const {
- // Due to the magic of the UTF8 encoding, searching for a sequence of
- // letters is equivalent to substring search.
- #ifdef INCLUDE_TENSORFLOW
- StringPiece searching(utf8_data(), utf8_length());
- StringPiece look_piece(look.utf8_data(), look.utf8_length());
- #endif
- LOG(FATAL) << "Not implemented";
- #ifdef INCLUDE_TENSORFLOW
- // StringPiece::size_type found =
- // searching.find(look_piece, start_pos.utf8_data() - utf8_data());
- StringPiece::size_type found = StringPiece::npos;
- if (found == StringPiece::npos)
- return end();
- return const_iterator(utf8_data() + found);
- #else
- return end();
- #endif
- }
- #ifdef INCLUDE_TENSORFLOW
- bool UnicodeText::HasReplacementChar() const {
- // Equivalent to:
- // UnicodeText replacement_char;
- // replacement_char.push_back(0xFFFD);
- // return find(replacement_char) != end();
- StringPiece searching(utf8_data(), utf8_length());
- StringPiece looking_for("\xEF\xBF\xBD", 3);
- LOG(FATAL) << "Not implemented";
- // return searching.find(looking_for) != StringPiece::npos;
- return false;
- }
- #endif
- // ----- other methods -----
- // Clear operator
- void UnicodeText::clear() {
- repr_.clear();
- }
- // Destructor
- UnicodeText::~UnicodeText() {}
- void UnicodeText::push_back(char32 c) {
- if (UniLib::IsValidCodepoint(c)) {
- char buf[UTFmax];
- int len = runetochar(buf, &c);
- if (UniLib::IsInterchangeValid(buf, len)) {
- repr_.append(buf, len);
- } else {
- LOG(WARNING) << "Unicode value 0x" << std::hex << c << " is not valid for interchange";
- repr_.append(" ", 1);
- }
- } else {
- LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
- repr_.append(" ", 1);
- }
- }
- int UnicodeText::size() const {
- return CodepointCount(repr_.data_, repr_.size_);
- }
- bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
- if (&lhs == &rhs)
- return true;
- if (lhs.repr_.size_ != rhs.repr_.size_)
- return false;
- return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
- }
- #ifdef INCLUDE_TENSORFLOW
- string UnicodeText::DebugString() const {
- return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(),
- repr_.DebugString().c_str());
- }
- #endif
- // ******************* UnicodeText::const_iterator *********************
- // The implementation of const_iterator would be nicer if it
- // inherited from boost::iterator_facade
- // (http://boost.org/libs/iterator/doc/iterator_facade.html).
- UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
- UnicodeText::const_iterator::const_iterator(const const_iterator &other) : it_(other.it_) {}
- UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(const const_iterator &other) {
- if (&other != this)
- it_ = other.it_;
- return *this;
- }
- UnicodeText::const_iterator UnicodeText::begin() const {
- return const_iterator(repr_.data_);
- }
- UnicodeText::const_iterator UnicodeText::end() const {
- return const_iterator(repr_.data_ + repr_.size_);
- }
- bool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs) {
- return lhs.it_ < rhs.it_;
- }
- char32 UnicodeText::const_iterator::operator*() const {
- // (We could call chartorune here, but that does some
- // error-checking, and we're guaranteed that our data is valid
- // UTF-8. Also, we expect this routine to be called very often. So
- // for speed, we do the calculation ourselves.)
- // Convert from UTF-8
- unsigned char byte1 = it_[0];
- if (byte1 < 0x80)
- return byte1;
- unsigned char byte2 = it_[1];
- if (byte1 < 0xE0)
- return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
- unsigned char byte3 = it_[2];
- if (byte1 < 0xF0)
- return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
- unsigned char byte4 = it_[3];
- return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
- }
- UnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {
- it_ += UniLib::OneCharLen(it_);
- return *this;
- }
- UnicodeText::const_iterator &UnicodeText::const_iterator::operator--() {
- while (UniLib::IsTrailByte(*--it_))
- ;
- return *this;
- }
- int UnicodeText::const_iterator::get_utf8(char *utf8_output) const {
- utf8_output[0] = it_[0];
- if ((it_[0] & 0xff) < 0x80)
- return 1;
- utf8_output[1] = it_[1];
- if ((it_[0] & 0xff) < 0xE0)
- return 2;
- utf8_output[2] = it_[2];
- if ((it_[0] & 0xff) < 0xF0)
- return 3;
- utf8_output[3] = it_[3];
- return 4;
- }
- string UnicodeText::const_iterator::get_utf8_string() const {
- return string(utf8_data(), utf8_length());
- }
- int UnicodeText::const_iterator::utf8_length() const {
- if ((it_[0] & 0xff) < 0x80) {
- return 1;
- } else if ((it_[0] & 0xff) < 0xE0) {
- return 2;
- } else if ((it_[0] & 0xff) < 0xF0) {
- return 3;
- } else {
- return 4;
- }
- }
- UnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const {
- CHECK(p != nullptr);
- const char *start = utf8_data();
- int len = utf8_length();
- const char *end = start + len;
- CHECK(p >= start);
- CHECK(p <= end);
- CHECK(p == end || !UniLib::IsTrailByte(*p));
- return const_iterator(p);
- }
- #ifdef INCLUDE_TENSORFLOW
- string UnicodeText::const_iterator::DebugString() const {
- return tensorflow::strings::Printf("{iter %p}", it_);
- }
- // *************************** Utilities *************************
- string CodepointString(const UnicodeText &t) {
- string s;
- UnicodeText::const_iterator it = t.begin(), end = t.end();
- while (it != end)
- tensorflow::strings::Appendf(&s, "%X ", *it++);
- return s;
- }
- #endif
|