utf.h 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. /*
  2. * The authors of this software are Rob Pike and Ken Thompson.
  3. * Copyright (c) 2002 by Lucent Technologies.
  4. * Permission to use, copy, modify, and distribute this software for any
  5. * purpose without fee is hereby granted, provided that this entire notice
  6. * is included in all copies of any software which is or includes a copy
  7. * or modification of this software and in all copies of the supporting
  8. * documentation for such software.
  9. * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
  10. * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
  11. * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
  12. * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
  13. */
  14. #ifndef _UTFH_
  15. #define _UTFH_ 1
  16. #include <stdint.h>
  17. typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
  18. enum {
  19. UTFmax = 4, /* maximum number of bytes in the UTF encoding of one rune */
  20. Runesync = 0x80, /* byte values below this (<) cannot be part of a multi-byte UTF sequence */
  21. Runeself = 0x80, /* runes below this (<) are encoded as one byte with the same value */
  22. Runeerror = 0xFFFD, /* rune value reported when a UTF decoding error occurs */
  23. Runemax = 0x10FFFF, /* largest valid rune value */
  24. };
  25. #ifdef __cplusplus
  26. extern "C" {
  27. #endif
  28. /*
  29. * rune routines
  30. */
  31. /*
  32. * These routines were written by Rob Pike and Ken Thompson
  33. * and first appeared in Plan 9.
  34. * SEE ALSO
  35. * utf (7)
  36. * tcs (1)
  37. */
  38. // runetochar copies (encodes) one rune, pointed to by r, to at most
  39. // UTFmax bytes starting at s and returns the number of bytes generated.
  40. int runetochar(char *s, const Rune *r);
  41. // chartorune copies (decodes) at most UTFmax bytes starting at s to
  42. // one rune, pointed to by r, and returns the number of bytes consumed.
  43. // If the input is not exactly in UTF format, chartorune will set *r
  44. // to Runeerror and return 1.
  45. //
  46. // Note: There is no special case for a "null-terminated" string. A
  47. // string whose first byte has the value 0 is the UTF8 encoding of the
  48. // Unicode value 0 (i.e., ASCII NUL). A byte value of 0 is illegal
  49. // anywhere else in a UTF sequence.
  50. int chartorune(Rune *r, const char *s);
  51. // charntorune is like chartorune, except that it will access at most
  52. // n bytes of s. If the UTF sequence is incomplete within n bytes,
  53. // charntorune will set *r to Runeerror and return 0. If it is complete
  54. // but not in UTF format, it will set *r to Runeerror and return 1.
  55. //
  56. // Added 2004-09-24 by Wei-Hwa Huang
  57. int charntorune(Rune *r, const char *s, int n);
  58. // isvalidcharntorune(str, n, r, consumed)
  59. // is a convenience function that calls "*consumed = charntorune(r, str, n)"
  60. // and returns an int (logically boolean) indicating whether the first
  61. // n bytes of str were a valid and complete UTF sequence.
  62. int isvalidcharntorune(const char *str, int n, Rune *r, int *consumed);
  63. // runelen returns the number of bytes required to convert r into UTF.
  64. int runelen(Rune r);
  65. // runenlen returns the number of bytes required to convert the n
  66. // runes pointed to by r into UTF.
  67. int runenlen(const Rune *r, int n);
  68. // fullrune returns 1 if the string s of length n is long enough to be
  69. // decoded by chartorune, and 0 otherwise. This does not guarantee
  70. // that the string contains a legal UTF encoding. This routine is used
  71. // by programs that obtain input one byte at a time and need to know
  72. // when a full rune has arrived.
  73. int fullrune(const char *s, int n);
  74. // The following routines are analogous to the corresponding string
  75. // routines with "utf" substituted for "str", and "rune" substituted
  76. // for "chr".
  77. // utflen returns the number of runes that are represented by the UTF
  78. // string s. (cf. strlen)
  79. int utflen(const char *s);
  80. // utfnlen returns the number of complete runes that are represented
  81. // by the first n bytes of the UTF string s. If the last few bytes of
  82. // the string contain an incompletely coded rune, utfnlen will not
  83. // count them; in this way, it differs from utflen, which includes
  84. // every byte of the string. (cf. strnlen)
  85. int utfnlen(const char *s, long n);
  86. // utfrune returns a pointer to the first occurrence of rune r in the
  87. // UTF string s, or 0 if r does not occur in the string. The NUL
  88. // byte terminating a string is considered to be part of the string s.
  89. // (cf. strchr)
  90. const char *utfrune(const char *s, Rune r);
  91. // utfrrune returns a pointer to the last occurrence of rune r in the
  92. // UTF string s, or 0 if r does not occur in the string. The NUL
  93. // byte terminating a string is considered to be part of the string s.
  94. // (cf. strrchr)
  95. const char *utfrrune(const char *s, Rune r);
  96. // utfutf returns a pointer to the first occurrence of the UTF string
  97. // s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
  98. // null string, utfutf returns s1. (cf. strstr)
  99. const char *utfutf(const char *s1, const char *s2);
  100. // utfecpy copies UTF sequences until a null sequence has been copied,
  101. // but writes no sequences beyond es1. If any sequences are copied,
  102. // s1 is terminated by a null sequence, and a pointer to that sequence
  103. // is returned. Otherwise, the original s1 is returned. (cf. strecpy)
  104. char *utfecpy(char *s1, char *es1, const char *s2);
  105. // These functions are rune-string analogues of the corresponding
  106. // functions in strcat (3).
  107. //
  108. // These routines first appeared in Plan 9.
  109. // SEE ALSO
  110. // memmove (3)
  111. // rune (3)
  112. // strcat (3)
  113. //
  114. // BUGS: The outcome of overlapping moves varies among implementations.
  115. Rune *runestrcat(Rune *s1, const Rune *s2);
  116. Rune *runestrncat(Rune *s1, const Rune *s2, long n);
  117. const Rune *runestrchr(const Rune *s, Rune c);
  118. int runestrcmp(const Rune *s1, const Rune *s2);
  119. int runestrncmp(const Rune *s1, const Rune *s2, long n);
  120. Rune *runestrcpy(Rune *s1, const Rune *s2);
  121. Rune *runestrncpy(Rune *s1, const Rune *s2, long n);
  122. Rune *runestrecpy(Rune *s1, Rune *es1, const Rune *s2);
  123. Rune *runestrdup(const Rune *s);
  124. const Rune *runestrrchr(const Rune *s, Rune c);
  125. long runestrlen(const Rune *s);
  126. const Rune *runestrstr(const Rune *s1, const Rune *s2);
  127. // The following routines test types and modify cases for Unicode
  128. // characters. Unicode defines some characters as letters and
  129. // specifies three cases: upper, lower, and title. Mappings among the
  130. // cases are also defined, although they are not exhaustive: some
  131. // upper case letters have no lower case mapping, and so on. Unicode
  132. // also defines several character properties, a subset of which are
  133. // checked by these routines. These routines are based on Unicode
  134. // version 3.0.0.
  135. //
  136. // NOTE: The routines are implemented in C, so the boolean functions
  137. // (e.g., isupperrune) return 0 for false and 1 for true.
  138. //
  139. //
  140. // toupperrune, tolowerrune, and totitlerune are the Unicode case
  141. // mappings. These routines return the character unchanged if it has
  142. // no defined mapping.
  143. Rune toupperrune(Rune r);
  144. Rune tolowerrune(Rune r);
  145. Rune totitlerune(Rune r);
  146. // isupperrune tests for upper case characters, including Unicode
  147. // upper case letters and targets of the toupper mapping. islowerrune
  148. // and istitlerune are defined analogously.
  149. int isupperrune(Rune r);
  150. int islowerrune(Rune r);
  151. int istitlerune(Rune r);
  152. // isalpharune tests for Unicode letters; this includes ideographs in
  153. // addition to alphabetic characters.
  154. int isalpharune(Rune r);
  155. // isdigitrune tests for digits. Non-digit numbers, such as Roman
  156. // numerals, are not included.
  157. int isdigitrune(Rune r);
  158. // isideographicrune tests for ideographic characters and numbers, as
  159. // defined by the Unicode standard.
  160. int isideographicrune(Rune r);
  161. // isspacerune tests for whitespace characters, including "C" locale
  162. // whitespace, Unicode defined whitespace, and the "zero-width
  163. // non-break space" character.
  164. int isspacerune(Rune r);
  165. // (The comments in this file were copied from the manpage files rune.3,
  166. // isalpharune.3, and runestrcat.3. Some formatting changes were also made
  167. // to conform to Google style. /JRM 11/11/05)
  168. #ifdef __cplusplus
  169. }
  170. #endif
  171. #endif