tokenizer.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. /* $NetBSD: tokenizer.c,v 1.10 2002/03/18 16:01:00 christos Exp $ */
  2. /*-
  3. * Copyright (c) 1992, 1993
  4. * The Regents of the University of California. All rights reserved.
  5. *
  6. * This code is derived from software contributed to Berkeley by
  7. * Christos Zoulas of Cornell University.
  8. *
  9. * Redistribution and use in source and binary forms, with or without
  10. * modification, are permitted provided that the following conditions
  11. * are met:
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. * 2. Redistributions in binary form must reproduce the above copyright
  15. * notice, this list of conditions and the following disclaimer in the
  16. * documentation and/or other materials provided with the distribution.
  17. * 3. All advertising materials mentioning features or use of this software
  18. * must display the following acknowledgement:
  19. * This product includes software developed by the University of
  20. * California, Berkeley and its contributors.
  21. * 4. Neither the name of the University nor the names of its contributors
  22. * may be used to endorse or promote products derived from this software
  23. * without specific prior written permission.
  24. *
  25. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  26. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  29. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  30. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  31. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  32. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  34. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35. * SUCH DAMAGE.
  36. */
  37. #include "config.h"
  38. #if !defined(lint) && !defined(SCCSID)
  39. #if 0
  40. static char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 6/4/93";
  41. #else
  42. __RCSID("$NetBSD: tokenizer.c,v 1.10 2002/03/18 16:01:00 christos Exp $");
  43. #endif
  44. #endif /* not lint && not SCCSID */
  /*
   * tokenizer.c: Bourne shell like tokenizer
   */
  48. #include <string.h>
  49. #include <stdlib.h>
  50. #include "tokenizer.h"
  /*
   * Quoting state carried by the tokenizer from one input character to
   * the next.
   */
  typedef enum {
      Q_none,         /* not inside any quoting construct */
      Q_single,       /* between single quotes */
      Q_double,       /* between double quotes */
      Q_one,          /* backslash seen outside quotes: next char is quoted */
      Q_doubleone     /* backslash seen inside double quotes */
  } quote_t;
  /* Field separators used when tok_init() is given a NULL separator list. */
  #define IFS                 "\t \n"

  /* Bits stored in the tokenizer's flags member. */
  #define TOK_KEEP            0x1     /* emit the current word even if it is empty */
  #define TOK_EAT             0x2     /* an escaped newline has just been swallowed */

  /* Growth steps: bytes for the word buffer, slots for the argument vector. */
  #define WINCR               20
  #define AINCR               10

  /* Allocator hooks; all tokenizer storage goes through these. */
  #define tok_malloc(a)       malloc(a)
  #define tok_free(a)         free(a)
  #define tok_realloc(a, b)   realloc(a, b)
  62. struct tokenizer {
  63. char *ifs; /* In field separator */
  64. int argc, amax; /* Current and maximum number of args */
  65. char **argv; /* Argument list */
  66. char *wptr, *wmax; /* Space and limit on the word buffer */
  67. char *wstart; /* Beginning of next word */
  68. char *wspace; /* Space of word buffer */
  69. quote_t quote; /* Quoting state */
  70. int flags; /* flags; */
  71. };
  72. private void tok_finish(Tokenizer *);
  73. /* tok_finish():
  74. * Finish a word in the tokenizer.
  75. */
  76. private void
  77. tok_finish(Tokenizer *tok)
  78. {
  79. *tok->wptr = '\0';
  80. if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
  81. tok->argv[tok->argc++] = tok->wstart;
  82. tok->argv[tok->argc] = NULL;
  83. tok->wstart = ++tok->wptr;
  84. }
  85. tok->flags &= ~TOK_KEEP;
  86. }
  87. /* tok_init():
  88. * Initialize the tokenizer
  89. */
  90. public Tokenizer *
  91. tok_init(const char *ifs)
  92. {
  93. Tokenizer *tok = (Tokenizer *) tok_malloc(sizeof(Tokenizer));
  94. tok->ifs = strdup(ifs ? ifs : IFS);
  95. tok->argc = 0;
  96. tok->amax = AINCR;
  97. tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax);
  98. if (tok->argv == NULL) {
  99. tok_free(tok);
  100. return (NULL);
  101. }
  102. tok->argv[0] = NULL;
  103. tok->wspace = (char *) tok_malloc(WINCR);
  104. if (tok->wspace == NULL) {
  105. tok_free(tok->argv);
  106. tok_free(tok);
  107. return (NULL);
  108. }
  109. tok->wmax = tok->wspace + WINCR;
  110. tok->wstart = tok->wspace;
  111. tok->wptr = tok->wspace;
  112. tok->flags = 0;
  113. tok->quote = Q_none;
  114. return (tok);
  115. }
  116. /* tok_reset():
  117. * Reset the tokenizer
  118. */
  119. public void
  120. tok_reset(Tokenizer *tok)
  121. {
  122. tok->argc = 0;
  123. tok->wstart = tok->wspace;
  124. tok->wptr = tok->wspace;
  125. tok->flags = 0;
  126. tok->quote = Q_none;
  127. }
  128. /* tok_end():
  129. * Clean up
  130. */
  131. public void
  132. tok_end(Tokenizer *tok)
  133. {
  134. tok_free((ptr_t) tok->ifs);
  135. tok_free((ptr_t) tok->wspace);
  136. tok_free((ptr_t) tok->argv);
  137. tok_free((ptr_t) tok);
  138. }
  139. /* tok_line():
  140. * Bourne shell like tokenizing
  141. * Return:
  142. * -1: Internal error
  143. * 3: Quoted return
  144. * 2: Unmatched double quote
  145. * 1: Unmatched single quote
  146. * 0: Ok
  147. */
  148. public int
  149. tok_line(Tokenizer *tok, const char *line, int *argc, const char ***argv)
  150. {
  151. const char *ptr;
  152. for (;;) {
  153. switch (*(ptr = line++)) {
  154. case '\'':
  155. tok->flags |= TOK_KEEP;
  156. tok->flags &= ~TOK_EAT;
  157. switch (tok->quote) {
  158. case Q_none:
  159. tok->quote = Q_single; /* Enter single quote
  160. * mode */
  161. break;
  162. case Q_single: /* Exit single quote mode */
  163. tok->quote = Q_none;
  164. break;
  165. case Q_one: /* Quote this ' */
  166. tok->quote = Q_none;
  167. *tok->wptr++ = *ptr;
  168. break;
  169. case Q_double: /* Stay in double quote mode */
  170. *tok->wptr++ = *ptr;
  171. break;
  172. case Q_doubleone: /* Quote this ' */
  173. tok->quote = Q_double;
  174. *tok->wptr++ = *ptr;
  175. break;
  176. default:
  177. return (-1);
  178. }
  179. break;
  180. case '"':
  181. tok->flags &= ~TOK_EAT;
  182. tok->flags |= TOK_KEEP;
  183. switch (tok->quote) {
  184. case Q_none: /* Enter double quote mode */
  185. tok->quote = Q_double;
  186. break;
  187. case Q_double: /* Exit double quote mode */
  188. tok->quote = Q_none;
  189. break;
  190. case Q_one: /* Quote this " */
  191. tok->quote = Q_none;
  192. *tok->wptr++ = *ptr;
  193. break;
  194. case Q_single: /* Stay in single quote mode */
  195. *tok->wptr++ = *ptr;
  196. break;
  197. case Q_doubleone: /* Quote this " */
  198. tok->quote = Q_double;
  199. *tok->wptr++ = *ptr;
  200. break;
  201. default:
  202. return (-1);
  203. }
  204. break;
  205. case '\\':
  206. tok->flags |= TOK_KEEP;
  207. tok->flags &= ~TOK_EAT;
  208. switch (tok->quote) {
  209. case Q_none: /* Quote next character */
  210. tok->quote = Q_one;
  211. break;
  212. case Q_double: /* Quote next character */
  213. tok->quote = Q_doubleone;
  214. break;
  215. case Q_one: /* Quote this, restore state */
  216. *tok->wptr++ = *ptr;
  217. tok->quote = Q_none;
  218. break;
  219. case Q_single: /* Stay in single quote mode */
  220. *tok->wptr++ = *ptr;
  221. break;
  222. case Q_doubleone: /* Quote this \ */
  223. tok->quote = Q_double;
  224. *tok->wptr++ = *ptr;
  225. break;
  226. default:
  227. return (-1);
  228. }
  229. break;
  230. case '\n':
  231. tok->flags &= ~TOK_EAT;
  232. switch (tok->quote) {
  233. case Q_none:
  234. tok_finish(tok);
  235. *argv = (const char **)tok->argv;
  236. *argc = tok->argc;
  237. return (0);
  238. case Q_single:
  239. case Q_double:
  240. *tok->wptr++ = *ptr; /* Add the return */
  241. break;
  242. case Q_doubleone: /* Back to double, eat the '\n' */
  243. tok->flags |= TOK_EAT;
  244. tok->quote = Q_double;
  245. break;
  246. case Q_one: /* No quote, more eat the '\n' */
  247. tok->flags |= TOK_EAT;
  248. tok->quote = Q_none;
  249. break;
  250. default:
  251. return (0);
  252. }
  253. break;
  254. case '\0':
  255. switch (tok->quote) {
  256. case Q_none:
  257. /* Finish word and return */
  258. if (tok->flags & TOK_EAT) {
  259. tok->flags &= ~TOK_EAT;
  260. return (3);
  261. }
  262. tok_finish(tok);
  263. *argv = (const char **)tok->argv;
  264. *argc = tok->argc;
  265. return (0);
  266. case Q_single:
  267. return (1);
  268. case Q_double:
  269. return (2);
  270. case Q_doubleone:
  271. tok->quote = Q_double;
  272. *tok->wptr++ = *ptr;
  273. break;
  274. case Q_one:
  275. tok->quote = Q_none;
  276. *tok->wptr++ = *ptr;
  277. break;
  278. default:
  279. return (-1);
  280. }
  281. break;
  282. default:
  283. tok->flags &= ~TOK_EAT;
  284. switch (tok->quote) {
  285. case Q_none:
  286. if (strchr(tok->ifs, *ptr) != NULL)
  287. tok_finish(tok);
  288. else
  289. *tok->wptr++ = *ptr;
  290. break;
  291. case Q_single:
  292. case Q_double:
  293. *tok->wptr++ = *ptr;
  294. break;
  295. case Q_doubleone:
  296. *tok->wptr++ = '\\';
  297. tok->quote = Q_double;
  298. *tok->wptr++ = *ptr;
  299. break;
  300. case Q_one:
  301. tok->quote = Q_none;
  302. *tok->wptr++ = *ptr;
  303. break;
  304. default:
  305. return (-1);
  306. }
  307. break;
  308. }
  309. if (tok->wptr >= tok->wmax - 4) {
  310. size_t size = tok->wmax - tok->wspace + WINCR;
  311. char *s = (char *) tok_realloc(tok->wspace, size);
  312. if (s == NULL)
  313. return (-1);
  314. if (s != tok->wspace) {
  315. int i;
  316. for (i = 0; i < tok->argc; i++) {
  317. tok->argv[i] =
  318. (tok->argv[i] - tok->wspace) + s;
  319. }
  320. tok->wptr = (tok->wptr - tok->wspace) + s;
  321. tok->wstart = (tok->wstart - tok->wspace) + s;
  322. tok->wspace = s;
  323. }
  324. tok->wmax = s + size;
  325. }
  326. if (tok->argc >= tok->amax - 4) {
  327. char **p;
  328. tok->amax += AINCR;
  329. p = (char **) tok_realloc(tok->argv,
  330. tok->amax * sizeof(char *));
  331. if (p == NULL)
  332. return (-1);
  333. tok->argv = p;
  334. }
  335. }
  336. }