unistr.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398
  1. /*
  2. * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
  3. *
  4. * Copyright (c) 2001-2006 Anton Altaparmakov
  5. *
  6. * This program/include file is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU General Public License as published
  8. * by the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program/include file is distributed in the hope that it will be
  12. * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  13. * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program (in the main directory of the Linux-NTFS
  18. * distribution in the file COPYING); if not, write to the Free Software
  19. * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  20. */
  21. #include <linux/slab.h>
  22. #include "types.h"
  23. #include "debug.h"
  24. #include "ntfs.h"
  25. /*
  26. * IMPORTANT
  27. * =========
  28. *
  29. * All these routines assume that the Unicode characters are in little endian
  30. * encoding inside the strings!!!
  31. */
  32. /*
  33. * This is used by the name collation functions to quickly determine what
  34. * characters are (in)valid.
  35. */
  36. static const u8 legal_ansi_char_array[0x40] = {
  37. 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  38. 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  39. 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  40. 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  41. 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
  42. 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
  43. 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
  44. 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
  45. };
  46. /**
  47. * ntfs_are_names_equal - compare two Unicode names for equality
  48. * @s1: name to compare to @s2
  49. * @s1_len: length in Unicode characters of @s1
  50. * @s2: name to compare to @s1
  51. * @s2_len: length in Unicode characters of @s2
  52. * @ic: ignore case bool
  53. * @upcase: upcase table (only if @ic == IGNORE_CASE)
  54. * @upcase_size: length in Unicode characters of @upcase (if present)
  55. *
  56. * Compare the names @s1 and @s2 and return 'true' (1) if the names are
  57. * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE,
  58. * the @upcase table is used to performa a case insensitive comparison.
  59. */
  60. bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
  61. const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
  62. const ntfschar *upcase, const u32 upcase_size)
  63. {
  64. if (s1_len != s2_len)
  65. return false;
  66. if (ic == CASE_SENSITIVE)
  67. return !ntfs_ucsncmp(s1, s2, s1_len);
  68. return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
  69. }
  70. /**
  71. * ntfs_collate_names - collate two Unicode names
  72. * @name1: first Unicode name to compare
  73. * @name2: second Unicode name to compare
  74. * @err_val: if @name1 contains an invalid character return this value
  75. * @ic: either CASE_SENSITIVE or IGNORE_CASE
  76. * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE)
  77. * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE)
  78. *
  79. * ntfs_collate_names collates two Unicode names and returns:
  80. *
  81. * -1 if the first name collates before the second one,
  82. * 0 if the names match,
  83. * 1 if the second name collates before the first one, or
  84. * @err_val if an invalid character is found in @name1 during the comparison.
  85. *
  86. * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
  87. */
  88. int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
  89. const ntfschar *name2, const u32 name2_len,
  90. const int err_val, const IGNORE_CASE_BOOL ic,
  91. const ntfschar *upcase, const u32 upcase_len)
  92. {
  93. u32 cnt, min_len;
  94. u16 c1, c2;
  95. min_len = name1_len;
  96. if (name1_len > name2_len)
  97. min_len = name2_len;
  98. for (cnt = 0; cnt < min_len; ++cnt) {
  99. c1 = le16_to_cpu(*name1++);
  100. c2 = le16_to_cpu(*name2++);
  101. if (ic) {
  102. if (c1 < upcase_len)
  103. c1 = le16_to_cpu(upcase[c1]);
  104. if (c2 < upcase_len)
  105. c2 = le16_to_cpu(upcase[c2]);
  106. }
  107. if (c1 < 64 && legal_ansi_char_array[c1] & 8)
  108. return err_val;
  109. if (c1 < c2)
  110. return -1;
  111. if (c1 > c2)
  112. return 1;
  113. }
  114. if (name1_len < name2_len)
  115. return -1;
  116. if (name1_len == name2_len)
  117. return 0;
  118. /* name1_len > name2_len */
  119. c1 = le16_to_cpu(*name1);
  120. if (c1 < 64 && legal_ansi_char_array[c1] & 8)
  121. return err_val;
  122. return 1;
  123. }
  124. /**
  125. * ntfs_ucsncmp - compare two little endian Unicode strings
  126. * @s1: first string
  127. * @s2: second string
  128. * @n: maximum unicode characters to compare
  129. *
  130. * Compare the first @n characters of the Unicode strings @s1 and @s2,
  131. * The strings in little endian format and appropriate le16_to_cpu()
  132. * conversion is performed on non-little endian machines.
  133. *
  134. * The function returns an integer less than, equal to, or greater than zero
  135. * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
  136. * to be less than, to match, or be greater than @s2.
  137. */
  138. int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
  139. {
  140. u16 c1, c2;
  141. size_t i;
  142. for (i = 0; i < n; ++i) {
  143. c1 = le16_to_cpu(s1[i]);
  144. c2 = le16_to_cpu(s2[i]);
  145. if (c1 < c2)
  146. return -1;
  147. if (c1 > c2)
  148. return 1;
  149. if (!c1)
  150. break;
  151. }
  152. return 0;
  153. }
  154. /**
  155. * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
  156. * @s1: first string
  157. * @s2: second string
  158. * @n: maximum unicode characters to compare
  159. * @upcase: upcase table
  160. * @upcase_size: upcase table size in Unicode characters
  161. *
  162. * Compare the first @n characters of the Unicode strings @s1 and @s2,
  163. * ignoring case. The strings in little endian format and appropriate
  164. * le16_to_cpu() conversion is performed on non-little endian machines.
  165. *
  166. * Each character is uppercased using the @upcase table before the comparison.
  167. *
  168. * The function returns an integer less than, equal to, or greater than zero
  169. * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
  170. * to be less than, to match, or be greater than @s2.
  171. */
  172. int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
  173. const ntfschar *upcase, const u32 upcase_size)
  174. {
  175. size_t i;
  176. u16 c1, c2;
  177. for (i = 0; i < n; ++i) {
  178. if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
  179. c1 = le16_to_cpu(upcase[c1]);
  180. if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
  181. c2 = le16_to_cpu(upcase[c2]);
  182. if (c1 < c2)
  183. return -1;
  184. if (c1 > c2)
  185. return 1;
  186. if (!c1)
  187. break;
  188. }
  189. return 0;
  190. }
  191. void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase,
  192. const u32 upcase_len)
  193. {
  194. u32 i;
  195. u16 u;
  196. for (i = 0; i < name_len; i++)
  197. if ((u = le16_to_cpu(name[i])) < upcase_len)
  198. name[i] = upcase[u];
  199. }
  200. void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
  201. const ntfschar *upcase, const u32 upcase_len)
  202. {
  203. ntfs_upcase_name((ntfschar*)&file_name_attr->file_name,
  204. file_name_attr->file_name_length, upcase, upcase_len);
  205. }
  206. int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
  207. FILE_NAME_ATTR *file_name_attr2,
  208. const int err_val, const IGNORE_CASE_BOOL ic,
  209. const ntfschar *upcase, const u32 upcase_len)
  210. {
  211. return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name,
  212. file_name_attr1->file_name_length,
  213. (ntfschar*)&file_name_attr2->file_name,
  214. file_name_attr2->file_name_length,
  215. err_val, ic, upcase, upcase_len);
  216. }
  217. /**
  218. * ntfs_nlstoucs - convert NLS string to little endian Unicode string
  219. * @vol: ntfs volume which we are working with
  220. * @ins: input NLS string buffer
  221. * @ins_len: length of input string in bytes
  222. * @outs: on return contains the allocated output Unicode string buffer
  223. *
  224. * Convert the input string @ins, which is in whatever format the loaded NLS
  225. * map dictates, into a little endian, 2-byte Unicode string.
  226. *
  227. * This function allocates the string and the caller is responsible for
  228. * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it.
  229. *
  230. * On success the function returns the number of Unicode characters written to
  231. * the output string *@outs (>= 0), not counting the terminating Unicode NULL
  232. * character. *@outs is set to the allocated output string buffer.
  233. *
  234. * On error, a negative number corresponding to the error code is returned. In
  235. * that case the output string is not allocated. Both *@outs and *@outs_len
  236. * are then undefined.
  237. *
  238. * This might look a bit odd due to fast path optimization...
  239. */
  240. int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
  241. const int ins_len, ntfschar **outs)
  242. {
  243. struct nls_table *nls = vol->nls_map;
  244. ntfschar *ucs;
  245. wchar_t wc;
  246. int i, o, wc_len;
  247. /* We do not trust outside sources. */
  248. if (likely(ins)) {
  249. ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS);
  250. if (likely(ucs)) {
  251. for (i = o = 0; i < ins_len; i += wc_len) {
  252. wc_len = nls->char2uni(ins + i, ins_len - i,
  253. &wc);
  254. if (likely(wc_len >= 0 &&
  255. o < NTFS_MAX_NAME_LEN)) {
  256. if (likely(wc)) {
  257. ucs[o++] = cpu_to_le16(wc);
  258. continue;
  259. } /* else if (!wc) */
  260. break;
  261. } /* else if (wc_len < 0 ||
  262. o >= NTFS_MAX_NAME_LEN) */
  263. goto name_err;
  264. }
  265. ucs[o] = 0;
  266. *outs = ucs;
  267. return o;
  268. } /* else if (!ucs) */
  269. ntfs_error(vol->sb, "Failed to allocate buffer for converted "
  270. "name from ntfs_name_cache.");
  271. return -ENOMEM;
  272. } /* else if (!ins) */
  273. ntfs_error(vol->sb, "Received NULL pointer.");
  274. return -EINVAL;
  275. name_err:
  276. kmem_cache_free(ntfs_name_cache, ucs);
  277. if (wc_len < 0) {
  278. ntfs_error(vol->sb, "Name using character set %s contains "
  279. "characters that cannot be converted to "
  280. "Unicode.", nls->charset);
  281. i = -EILSEQ;
  282. } else /* if (o >= NTFS_MAX_NAME_LEN) */ {
  283. ntfs_error(vol->sb, "Name is too long (maximum length for a "
  284. "name on NTFS is %d Unicode characters.",
  285. NTFS_MAX_NAME_LEN);
  286. i = -ENAMETOOLONG;
  287. }
  288. return i;
  289. }
  290. /**
  291. * ntfs_ucstonls - convert little endian Unicode string to NLS string
  292. * @vol: ntfs volume which we are working with
  293. * @ins: input Unicode string buffer
  294. * @ins_len: length of input string in Unicode characters
  295. * @outs: on return contains the (allocated) output NLS string buffer
  296. * @outs_len: length of output string buffer in bytes
  297. *
  298. * Convert the input little endian, 2-byte Unicode string @ins, of length
  299. * @ins_len into the string format dictated by the loaded NLS.
  300. *
  301. * If *@outs is NULL, this function allocates the string and the caller is
  302. * responsible for calling kfree(*@outs); when finished with it. In this case
  303. * @outs_len is ignored and can be 0.
  304. *
  305. * On success the function returns the number of bytes written to the output
  306. * string *@outs (>= 0), not counting the terminating NULL byte. If the output
  307. * string buffer was allocated, *@outs is set to it.
  308. *
  309. * On error, a negative number corresponding to the error code is returned. In
  310. * that case the output string is not allocated. The contents of *@outs are
  311. * then undefined.
  312. *
  313. * This might look a bit odd due to fast path optimization...
  314. */
  315. int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
  316. const int ins_len, unsigned char **outs, int outs_len)
  317. {
  318. struct nls_table *nls = vol->nls_map;
  319. unsigned char *ns;
  320. int i, o, ns_len, wc;
  321. /* We don't trust outside sources. */
  322. if (ins) {
  323. ns = *outs;
  324. ns_len = outs_len;
  325. if (ns && !ns_len) {
  326. wc = -ENAMETOOLONG;
  327. goto conversion_err;
  328. }
  329. if (!ns) {
  330. ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
  331. ns = kmalloc(ns_len + 1, GFP_NOFS);
  332. if (!ns)
  333. goto mem_err_out;
  334. }
  335. for (i = o = 0; i < ins_len; i++) {
  336. retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
  337. ns_len - o);
  338. if (wc > 0) {
  339. o += wc;
  340. continue;
  341. } else if (!wc)
  342. break;
  343. else if (wc == -ENAMETOOLONG && ns != *outs) {
  344. unsigned char *tc;
  345. /* Grow in multiples of 64 bytes. */
  346. tc = kmalloc((ns_len + 64) &
  347. ~63, GFP_NOFS);
  348. if (tc) {
  349. memcpy(tc, ns, ns_len);
  350. ns_len = ((ns_len + 64) & ~63) - 1;
  351. kfree(ns);
  352. ns = tc;
  353. goto retry;
  354. } /* No memory so goto conversion_error; */
  355. } /* wc < 0, real error. */
  356. goto conversion_err;
  357. }
  358. ns[o] = 0;
  359. *outs = ns;
  360. return o;
  361. } /* else (!ins) */
  362. ntfs_error(vol->sb, "Received NULL pointer.");
  363. return -EINVAL;
  364. conversion_err:
  365. ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
  366. "converted to character set %s. You might want to "
  367. "try to use the mount option nls=utf8.", nls->charset);
  368. if (ns != *outs)
  369. kfree(ns);
  370. if (wc != -ENAMETOOLONG)
  371. wc = -EILSEQ;
  372. return wc;
  373. mem_err_out:
  374. ntfs_error(vol->sb, "Failed to allocate name!");
  375. return -ENOMEM;
  376. }