1 /* GLIB - Library of useful routines for C programming
2 * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
4 * GScanner: Flexible lexical scanner for general purpose.
5 * Copyright (C) 1997 Tim Janik
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
17 * You should have received a copy of the GNU Library General Public
18 * License along with this library; if not, write to the Free
19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 #define __gscanner_c__
/* to_lower(c): case-folding helper; later in this file it is applied to the
 * characters of symbol names (hash_val->key[i] and symbol[i]).
 * Each visible term evaluates to a fixed offset when (guchar) c lies in the
 * tested range and to 0 otherwise (comparison result times offset):
 *   'A' .. 'Z'  ->  'a' - 'A'
 *   192 .. 214  ->  224 - 192
 *   216 .. 222  ->  248 - 216
 * NOTE(review): the numeric ranges look like the ISO-8859-1 upper-case
 * letters (215 is skipped) -- confirm.  Parts of the macro are not visible
 * in this listing, so how the summed offsets are combined with c itself
 * cannot be verified from here.
 */
32 #define to_lower(c) ( \
34 ( (((guchar)(c))>='A' && ((guchar)(c))<='Z') * ('a'-'A') ) + \
35 ( (((guchar)(c))>=192 && ((guchar)(c))<=214) * (224-192) ) + \
36 ( (((guchar)(c))>=216 && ((guchar)(c))<=222) * (248-216) ) + \
42 /* --- typedefs --- */
/* Record stored in the scanner's symbol table.  The code in this file
 * accesses two members: key (a g_strdup() copy of the symbol name, also used
 * as the hash table key) and value (the value handed to
 * g_scanner_add_symbol() and returned by g_scanner_lookup_symbol()).
 * The member declarations themselves are not visible in this listing.
 */
43 typedef struct _GScannerHashVal GScannerHashVal;
45 struct _GScannerHashVal
53 /* --- variables --- */
/* Built-in scanner configuration.  g_scanner_new() points config_templ at
 * this object (presumably when the caller supplies no template -- the
 * guarding condition is not visible in this listing) and then copies the
 * fields into the new scanner's own GScannerConfig.
 * Every initializer is tagged with the name of the GScannerConfig field it
 * sets; several initializer lines (the character-set strings and a few
 * flags) are not visible here.
 */
54 static GScannerConfig g_scanner_config_template =
58 ) /* cset_skip_characters */,
63 ) /* cset_identifier_first */,
70 ) /* cset_identifier_nth */,
71 ( "#\n" ) /* cpair_comment_single */,
73 FALSE /* case_sensitive */,
75 TRUE /* skip_comment_multi */,
76 TRUE /* skip_comment_single */,
77 TRUE /* scan_comment_multi */,
78 TRUE /* scan_identifier */,
79 FALSE /* scan_identifier_1char */,
80 FALSE /* scan_identifier_NULL */,
81 TRUE /* scan_symbols */,
82 FALSE /* scan_binary */,
83 TRUE /* scan_octal */,
84 TRUE /* scan_float */,
86 FALSE /* scan_hex_dollar */,
87 TRUE /* scan_string_sq */,
88 TRUE /* scan_string_dq */,
89 TRUE /* numbers_2_int */,
90 FALSE /* int_2_float */,
91 FALSE /* identifier_2_string */,
92 TRUE /* char_2_token */,
93 FALSE /* symbol_2_token */,
97 /* --- prototypes --- */
/* Forward declarations of the helpers defined further down in this file:
 * symbol table lookup, the two token-fetching layers (get_token_i calls
 * get_token_ll), release of a token's value, digit conversion, and the
 * single-character peek/get input primitives.  The remaining parameter
 * lines of most declarations are not visible in this listing.
 */
98 static GScannerHashVal* g_scanner_lookup_internal (GScanner *scanner,
100 static void g_scanner_get_token_ll (GScanner *scanner,
105 static void g_scanner_get_token_i (GScanner *scanner,
110 static void g_scanner_free_value (GTokenType *token_p,
114 gint g_scanner_char_2_num (guchar c,
116 static guchar g_scanner_peek_next_char(GScanner *scanner);
117 static guchar g_scanner_get_char (GScanner *scanner,
122 /* --- functions --- */
/* Map the character c to a digit value.
 * The visible branches classify c as a decimal digit, an upper-case letter
 * or a lower-case letter; the value computed in each branch and any final
 * range check are on lines not shown in this listing.
 * Call sites in this file pass a second argument of 16, 10 or 36
 * (presumably the radix -- confirm) and treat a negative result as
 * "not a valid digit".
 */
124 g_scanner_char_2_num (guchar c,
127 if (c >= '0' && c <= '9')
129 else if (c >= 'A' && c <= 'Z')
131 else if (c >= 'a' && c <= 'z')
/* Create and initialise a scanner.
 * config_templ supplies the configuration to copy; the visible assignment
 * below replaces it with g_scanner_config_template (the condition guarding
 * that assignment is not visible -- presumably a NULL check).
 * The scanner and its private GScannerConfig are allocated zero-filled with
 * g_new0(), the token state is reset, an empty string-keyed symbol table is
 * created, and no input source is attached (text == NULL, input_fd == -1).
 * The return statement is not visible in this listing.
 */
143 g_scanner_new (GScannerConfig *config_templ)
145 register GScanner *scanner;
148 config_templ = &g_scanner_config_template;
150 scanner = g_new0 (GScanner, 1);
152 scanner->user_data = NULL;
153 scanner->input_name = NULL;
154 scanner->parse_errors = 0;
155 scanner->max_parse_errors = 0;
157 scanner->config = g_new0 (GScannerConfig, 1);
/* Field-by-field copy of the template.  The cset_ and cpair_ members are
 * assigned as pointers, not duplicated, so the strings stay shared with the
 * template and must remain valid for as long as the scanner uses them.
 */
159 scanner->config->case_sensitive = config_templ->case_sensitive;
160 scanner->config->cset_skip_characters = config_templ->cset_skip_characters;
161 scanner->config->cset_identifier_first= config_templ->cset_identifier_first;
162 scanner->config->cset_identifier_nth = config_templ->cset_identifier_nth;
163 scanner->config->cpair_comment_single = config_templ->cpair_comment_single;
164 scanner->config->skip_comment_multi = config_templ->skip_comment_multi;
165 scanner->config->skip_comment_single = config_templ->skip_comment_single;
166 scanner->config->scan_comment_multi = config_templ->scan_comment_multi;
167 scanner->config->scan_identifier = config_templ->scan_identifier;
168 scanner->config->scan_identifier_1char= config_templ->scan_identifier_1char;
169 scanner->config->scan_identifier_NULL = config_templ->scan_identifier_NULL;
170 scanner->config->scan_symbols = config_templ->scan_symbols;
171 scanner->config->scan_binary = config_templ->scan_binary;
172 scanner->config->scan_octal = config_templ->scan_octal;
173 scanner->config->scan_float = config_templ->scan_float;
174 scanner->config->scan_hex = config_templ->scan_hex;
175 scanner->config->scan_hex_dollar = config_templ->scan_hex_dollar;
176 scanner->config->scan_string_sq = config_templ->scan_string_sq;
177 scanner->config->scan_string_dq = config_templ->scan_string_dq;
178 scanner->config->numbers_2_int = config_templ->numbers_2_int;
179 scanner->config->int_2_float = config_templ->int_2_float;
180 scanner->config->identifier_2_string = config_templ->identifier_2_string;
181 scanner->config->char_2_token = config_templ->char_2_token;
182 scanner->config->symbol_2_token = config_templ->symbol_2_token;
/* Current token: nothing scanned yet. */
184 scanner->token = G_TOKEN_NONE;
185 scanner->value.v_int = 0;
187 scanner->position = 0;
/* Look-ahead token (filled by g_scanner_peek_next_token): none cached. */
189 scanner->next_token = G_TOKEN_NONE;
190 scanner->next_value.v_int = 0;
191 scanner->next_line = 1;
192 scanner->next_position = 0;
/* Symbol table keyed by string contents (g_str_hash / g_str_equal). */
194 scanner->symbol_table = g_hash_table_new (g_str_hash, g_str_equal);
/* No input yet; peeked_char == -1 means "no character buffered". */
195 scanner->text = NULL;
196 scanner->text_len = 0;
197 scanner->input_fd = -1;
198 scanner->peeked_char = -1;
/* Per-entry callback that g_scanner_destroy() hands to
 * g_hash_table_foreach() for the symbol table.  Only the first parameter
 * line is visible in this listing; the body is not shown.
 */
204 g_scanner_destroy_symbol_table_entry (gpointer key,
/* Tear down a scanner.  Does nothing (beyond g_return_if_fail's handling)
 * when scanner is NULL.
 * Order of the visible steps: run the per-entry callback over the symbol
 * table, destroy the table, release any value still held by the current and
 * the look-ahead token, then free the private configuration.
 * NOTE(review): freeing of the GScanner structure itself is not visible in
 * this listing -- confirm it happens on an elided line.
 */
213 g_scanner_destroy (GScanner *scanner)
215 g_return_if_fail (scanner != NULL);
217 g_hash_table_foreach (scanner->symbol_table,
218 g_scanner_destroy_symbol_table_entry, NULL);
219 g_hash_table_destroy (scanner->symbol_table);
220 g_scanner_free_value (&scanner->token, &scanner->value);
221 g_scanner_free_value (&scanner->next_token, &scanner->next_value);
222 g_free (scanner->config);
/* Attach a file descriptor as the scanner's input.
 * input_fd must be >= 0.  The current and look-ahead token are reset to
 * G_TOKEN_NONE, any text buffer is detached (text = NULL, text_len = 0) and
 * the one-character peek buffer is cleared (peeked_char = -1).
 * NOTE(review): no g_scanner_free_value() call is visible before the token
 * state is overwritten; unless one sits on an elided line, a string value
 * held by a pending token would be leaked -- confirm.
 */
227 g_scanner_input_file (GScanner *scanner,
230 g_return_if_fail (input_fd >= 0);
232 scanner->token = G_TOKEN_NONE;
233 scanner->value.v_int = 0;
235 scanner->position = 0;
236 scanner->next_token = G_TOKEN_NONE;
238 scanner->text = NULL;
239 scanner->text_len = 0;
240 scanner->input_fd = input_fd;
241 scanner->peeked_char = -1;
/* Attach an in-memory buffer as the scanner's input.
 * text must not be NULL.  The pointer is stored as-is (no copy is made on
 * the visible lines), together with text_len; file input is disabled
 * (input_fd = -1) and the peek buffer is cleared (peeked_char = -1).
 * The current and look-ahead token are reset to G_TOKEN_NONE.
 * NOTE(review): as in g_scanner_input_file, no g_scanner_free_value() call
 * is visible before the token state is overwritten -- confirm.
 */
245 g_scanner_input_text (GScanner *scanner,
249 g_return_if_fail (text != NULL);
251 scanner->token = G_TOKEN_NONE;
252 scanner->value.v_int = 0;
254 scanner->position = 0;
255 scanner->next_token = G_TOKEN_NONE;
257 scanner->text = text;
258 scanner->text_len = text_len;
259 scanner->input_fd = -1;
260 scanner->peeked_char = -1;
/* Register symbol in the scanner's symbol table, associated with value.
 * Both symbol and scanner must be non-NULL.
 * The symbol is first looked up.  The visible statements then show two
 * paths whose selecting conditions are on elided lines: one allocates a new
 * GScannerHashVal, stores a private copy of the name and inserts it; the
 * other only overwrites hash_val->value (presumably when the symbol already
 * exists).
 */
264 g_scanner_add_symbol (GScanner *scanner,
268 register GScannerHashVal *hash_val;
270 g_return_if_fail (symbol != NULL);
271 g_return_if_fail (scanner != NULL);
273 hash_val = g_scanner_lookup_internal (scanner, symbol);
277 hash_val = g_new (GScannerHashVal, 1);
278 hash_val->key = g_strdup (symbol);
279 hash_val->value = value;
/* Case-insensitive scanners store the key lower-cased, matching the
 * lower-casing done by g_scanner_lookup_internal() before lookup.
 */
280 if (!scanner->config->case_sensitive)
284 l = strlen (hash_val->key);
285 for (i = 0; i < l; i++)
286 hash_val->key[i] = to_lower (hash_val->key[i]);
/* The duplicated key string doubles as the hash table key. */
288 g_hash_table_insert (scanner->symbol_table, hash_val->key, hash_val);
291 hash_val->value = value;
/* Look up symbol and return the value stored for it.
 * Returns NULL when scanner is NULL.  What is returned for an unknown
 * symbol is decided on lines not visible in this listing.
 */
295 g_scanner_lookup_symbol (GScanner *scanner,
298 register GScannerHashVal *hash_val;
300 g_return_val_if_fail (scanner != NULL, NULL);
305 hash_val = g_scanner_lookup_internal (scanner, symbol);
308 return hash_val->value;
/* Remove symbol from the scanner's symbol table.
 * The entry is located with g_scanner_lookup_internal(), removed from the
 * hash table using its stored key, and the key string is freed.
 * NOTE(review): the guard for "symbol not found" and the release of the
 * GScannerHashVal itself are not visible in this listing -- confirm.
 */
314 g_scanner_remove_symbol (GScanner *scanner,
317 register GScannerHashVal *hash_val;
319 hash_val = g_scanner_lookup_internal (scanner, symbol);
323 g_hash_table_remove (scanner->symbol_table, hash_val->key);
324 g_free (hash_val->key);
/* Return the type of the upcoming token without making it current.
 * Returns G_TOKEN_EOF when scanner is NULL.
 * If no look-ahead token is cached (next_token == G_TOKEN_NONE), scanning
 * starts from the current line/position and the result is stored in the
 * next_* fields; a later call returns the cached token without rescanning.
 */
330 g_scanner_peek_next_token (GScanner *scanner)
332 g_return_val_if_fail (scanner != NULL, G_TOKEN_EOF);
334 if (scanner->next_token == G_TOKEN_NONE)
336 scanner->next_line = scanner->line;
337 scanner->next_position = scanner->position;
338 g_scanner_get_token_i (scanner,
339 &scanner->next_token,
340 &scanner->next_value,
342 &scanner->next_position);
345 return scanner->next_token;
/* Advance to the next token and return its type.
 * Returns G_TOKEN_EOF when scanner is NULL.
 * When a look-ahead token is cached, the current value is released and the
 * next_* fields are moved into the current-token fields; next_token is then
 * cleared so the value is owned in one place only.  The other visible path
 * scans via g_scanner_get_token_i() (presumably the else branch; its
 * header and arguments are on elided lines).
 */
349 g_scanner_get_next_token (GScanner *scanner)
351 g_return_val_if_fail (scanner != NULL, G_TOKEN_EOF);
353 if (scanner->next_token != G_TOKEN_NONE)
355 g_scanner_free_value (&scanner->token, &scanner->value);
357 scanner->token = scanner->next_token;
358 scanner->value = scanner->next_value;
359 scanner->line = scanner->next_line;
360 scanner->position = scanner->next_position;
361 scanner->next_token = G_TOKEN_NONE;
364 g_scanner_get_token_i (scanner,
370 return scanner->token;
/* Return the type of the current token (scanner->token), or G_TOKEN_EOF
 * when scanner is NULL.
 */
374 g_scanner_cur_token (GScanner *scanner)
376 g_return_val_if_fail (scanner != NULL, G_TOKEN_EOF);
378 return scanner->token;
/* Return the value of the current token (scanner->value).
 * When scanner is NULL the local v is returned instead; its declaration and
 * initialisation are on lines not visible in this listing.
 */
382 g_scanner_cur_value (GScanner *scanner)
387 g_return_val_if_fail (scanner != NULL, v);
389 return scanner->value;
/* Return the line recorded for the current token (scanner->line), or 0
 * when scanner is NULL.
 */
393 g_scanner_cur_line (GScanner *scanner)
395 g_return_val_if_fail (scanner != NULL, 0);
397 return scanner->line;
/* Return the position recorded for the current token (scanner->position),
 * or 0 when scanner is NULL.
 */
401 g_scanner_cur_position (GScanner *scanner)
403 g_return_val_if_fail (scanner != NULL, 0);
405 return scanner->position;
/* Report whether the current token is G_TOKEN_EOF.  A NULL scanner is
 * reported as end of input (TRUE).
 */
409 g_scanner_eof (GScanner *scanner)
411 g_return_val_if_fail (scanner != NULL, TRUE);
413 return scanner->token == G_TOKEN_EOF;
/* Find the symbol table entry for symbol.
 * For a case-insensitive scanner the name is first copied, character by
 * character through to_lower(), into a temporary buffer of l + 1 bytes and
 * the lookup is done with that copy; otherwise symbol is used directly.
 * The result is whatever g_hash_table_lookup() yields.
 * NOTE(review): the computation of l, the terminating NUL and the release
 * of buffer are not visible in this listing -- confirm they are present.
 */
416 static GScannerHashVal*
417 g_scanner_lookup_internal (GScanner *scanner,
420 register GScannerHashVal *hash_val;
422 if (!scanner->config->case_sensitive)
424 register gchar *buffer;
428 buffer = g_new (gchar, l + 1);
429 for (i = 0; i < l; i++)
430 buffer[i] = to_lower (symbol[i]);
432 hash_val = g_hash_table_lookup (scanner->symbol_table, buffer);
436 hash_val = g_hash_table_lookup (scanner->symbol_table, (gchar*) symbol);
/* Look at the next input character without consuming it.
 * Text input: reads scanner->text[0] while text_len is non-zero.
 * File input: if nothing is buffered (peeked_char < 0), one byte is read
 * from input_fd and remembered in peeked_char; the buffered character is
 * then returned.  The read is repeated while it returns -1 and a further
 * condition on an elided line holds.
 * The handling of a failed/short read and the final return are not visible
 * in this listing.
 */
442 g_scanner_peek_next_char (GScanner *scanner)
446 if (scanner->text_len)
448 fchar = scanner->text[0];
450 else if (scanner->input_fd >= 0)
452 if (scanner->peeked_char < 0)
458 count = read (scanner->input_fd, &fchar, 1);
460 while (count == -1 &&
467 scanner->peeked_char = fchar;
470 fchar = scanner->peeked_char;
/* Consume and return the next input character.
 * Text input: returns *text and advances the text pointer.
 * File input: when no character is buffered, one byte is read from
 * input_fd; a read that does not deliver exactly one byte, or delivers a
 * NUL byte, sets peeked_char to 0, which g_scanner_get_token_ll() tests for
 * in its end-of-input check.  When a character is buffered it is returned
 * and the buffer is cleared (peeked_char = -1).
 * Callers pass line and position pointers as further arguments; the code
 * that updates them and the text_len bookkeeping are on lines not visible
 * in this listing.
 */
479 g_scanner_get_char (GScanner *scanner,
485 if (scanner->text_len)
487 fchar = *(scanner->text++);
490 else if (scanner->input_fd >= 0)
492 if (scanner->peeked_char < 0)
498 count = read (scanner->input_fd, &fchar, 1);
500 while (count == -1 &&
503 if (count != 1 || fchar == 0)
506 scanner->peeked_char = 0;
511 fchar = scanner->peeked_char;
513 scanner->peeked_char = -1;
/* Release the heap data owned by a token value and mark the token as
 * G_TOKEN_NONE.
 * For the visible case labels (identifier, NULL identifier, single-line and
 * multi-line comment) value_p->v_string is freed; further case labels may
 * exist on lines not visible in this listing.
 */
533 g_scanner_free_value (GTokenType *token_p,
539 case G_TOKEN_IDENTIFIER:
540 case G_TOKEN_IDENTIFIER_NULL:
541 case G_TOKEN_COMMENT_SINGLE:
542 case G_TOKEN_COMMENT_MULTI:
543 g_free (value_p->v_string);
550 *token_p = G_TOKEN_NONE;
/* Fetch the next token that is relevant to the caller and apply the
 * configured token conversions.
 * The previous value is released and g_scanner_get_token_ll() is called;
 * the while condition below ends in ");", i.e. it closes a do/while, so
 * those two calls are presumably its body (the "do" line is not visible).
 * Scanning repeats while the token is a skip character -- either as a
 * character token in the range 1..255 or as G_TOKEN_CHAR -- or a comment
 * whose skip_comment_* flag is set.
 * Afterwards the visible conversions are applied: identifier -> string
 * (identifier_2_string), symbol -> its value used as token type
 * (symbol_2_token), numeric token -> G_TOKEN_INT (numbers_2_int), and
 * int -> float (int_2_float).  The switch header and some case labels are
 * on lines not visible in this listing.
 */
554 g_scanner_get_token_i (GScanner *scanner,
562 g_scanner_free_value (token_p, value_p);
563 g_scanner_get_token_ll (scanner, token_p, value_p, line_p, position_p);
565 while (((*token_p > 0 && *token_p < 256) &&
566 strchr (scanner->config->cset_skip_characters, *token_p)) ||
567 (*token_p == G_TOKEN_CHAR &&
568 strchr (scanner->config->cset_skip_characters, value_p->v_char)) ||
569 (*token_p == G_TOKEN_COMMENT_MULTI &&
570 scanner->config->skip_comment_multi) ||
571 (*token_p == G_TOKEN_COMMENT_SINGLE &&
572 scanner->config->skip_comment_single));
576 case G_TOKEN_IDENTIFIER:
577 if (scanner->config->identifier_2_string)
578 *token_p = G_TOKEN_STRING;
582 if (scanner->config->symbol_2_token)
583 *token_p = (GTokenType) value_p->v_symbol;
589 if (scanner->config->numbers_2_int)
590 *token_p = G_TOKEN_INT;
/* Runs after the conversions above, so a token just turned into
 * G_TOKEN_INT is also eligible for the float conversion.
 */
597 if (*token_p == G_TOKEN_INT &&
598 scanner->config->int_2_float)
600 *token_p = G_TOKEN_FLOAT;
601 value_p->v_float = value_p->v_int;
608 g_scanner_get_token_ll (GScanner *scanner,
614 register GScannerConfig *config;
615 register gboolean in_comment_multi;
616 register gboolean in_comment_single;
617 register gboolean in_string_sq;
618 register gboolean in_string_dq;
620 register GTokenType token;
621 register GValue value;
622 register GString *gstring;
624 config = scanner->config;
625 (*value_p).v_int = 0;
627 if (scanner->token == G_TOKEN_EOF ||
628 (!scanner->text_len &&
629 (scanner->input_fd < 0 ||
630 scanner->peeked_char == 0)))
632 *token_p = G_TOKEN_EOF;
636 in_comment_multi = FALSE;
637 in_comment_single = FALSE;
638 in_string_sq = FALSE;
639 in_string_dq = FALSE;
644 register gboolean dotted_float = FALSE;
646 ch = g_scanner_get_char (scanner, line_p, position_p);
649 token = G_TOKEN_NONE;
651 /* this is *evil*, but needed ;(
652 * we first check for identifier first character, because it
653 * might interfere with other key chars like slashes or numbers
655 if (config->scan_identifier &&
656 ch && strchr (config->cset_identifier_first, ch))
657 goto identifier_precedence;
661 register gboolean in_number;
662 static gchar *endptr;
671 if (!config->scan_comment_multi ||
672 g_scanner_peek_next_char (scanner) != '*')
674 g_scanner_get_char (scanner, line_p, position_p);
675 token = G_TOKEN_COMMENT_MULTI;
676 in_comment_multi = TRUE;
677 gstring = g_string_new ("");
678 while ((ch = g_scanner_get_char (scanner, line_p, position_p)) != 0)
680 if (ch == '*' && g_scanner_peek_next_char (scanner) == '/')
682 g_scanner_get_char (scanner, line_p, position_p);
683 in_comment_multi = FALSE;
687 gstring = g_string_append_c (gstring, ch);
693 if (!config->scan_string_sq)
695 token = G_TOKEN_STRING;
697 gstring = g_string_new ("");
698 while ((ch = g_scanner_get_char (scanner, line_p, position_p)) != 0)
702 in_string_sq = FALSE;
706 gstring = g_string_append_c (gstring, ch);
712 if (!config->scan_string_dq)
714 token = G_TOKEN_STRING;
716 gstring = g_string_new ("");
717 while ((ch = g_scanner_get_char (scanner, line_p, position_p)) != 0)
721 in_string_dq = FALSE;
728 ch = g_scanner_get_char (scanner, line_p, position_p);
732 register guint fchar;
738 gstring = g_string_append_c (gstring, '\\');
742 gstring = g_string_append_c (gstring, '\n');
746 gstring = g_string_append_c (gstring, '\t');
750 gstring = g_string_append_c (gstring, '\r');
754 gstring = g_string_append_c (gstring, '\b');
758 gstring = g_string_append_c (gstring, '\f');
770 fchar = g_scanner_peek_next_char (scanner);
771 if (fchar >= '0' && fchar <= '7')
773 ch = g_scanner_get_char (scanner, line_p, position_p);
775 fchar = g_scanner_peek_next_char (scanner);
776 if (fchar >= '0' && fchar <= '7')
778 ch = g_scanner_get_char (scanner, line_p, position_p);
779 i = i * 8 + ch - '0';
782 gstring = g_string_append_c (gstring, i);
786 gstring = g_string_append_c (gstring, ch);
791 gstring = g_string_append_c (gstring, ch);
798 if (!config->scan_float)
800 token = G_TOKEN_FLOAT;
802 ch = g_scanner_get_char (scanner, line_p, position_p);
806 if (!config->scan_hex_dollar)
809 ch = g_scanner_get_char (scanner, line_p, position_p);
813 if (config->scan_octal)
814 token = G_TOKEN_OCTAL;
817 ch = g_scanner_peek_next_char (scanner);
818 if (config->scan_hex && (ch == 'x' || ch == 'X'))
821 g_scanner_get_char (scanner, line_p, position_p);
822 ch = g_scanner_get_char (scanner, line_p, position_p);
825 token = G_TOKEN_ERROR;
826 value.v_error = G_ERR_UNEXP_EOF;
830 if (g_scanner_char_2_num (ch, 16) < 0)
832 token = G_TOKEN_ERROR;
833 value.v_error = G_ERR_DIGIT_RADIX;
838 else if (config->scan_binary && (ch == 'b' || ch == 'B'))
840 token = G_TOKEN_BINARY;
841 g_scanner_get_char (scanner, line_p, position_p);
842 ch = g_scanner_get_char (scanner, line_p, position_p);
845 token = G_TOKEN_ERROR;
846 value.v_error = G_ERR_UNEXP_EOF;
850 if (g_scanner_char_2_num (ch, 10) < 0)
852 token = G_TOKEN_ERROR;
853 value.v_error = G_ERR_NON_DIGIT_IN_CONST;
871 if (token == G_TOKEN_NONE)
874 gstring = g_string_new (dotted_float ? "0." : "");
875 gstring = g_string_append_c (gstring, ch);
879 register gboolean is_E;
881 is_E = (ch == 'e' || ch == 'E') && token == G_TOKEN_FLOAT;
882 ch = g_scanner_peek_next_char (scanner);
884 if (g_scanner_char_2_num (ch, 36) >= 0 ||
885 (config->scan_float && ch == '.') ||
886 (is_E && ch == '+') ||
887 (is_E && ch == '-') )
888 ch = g_scanner_get_char (scanner, line_p, position_p);
896 if (token != G_TOKEN_INT &&
897 token != G_TOKEN_OCTAL)
899 token = G_TOKEN_ERROR;
900 if (token == G_TOKEN_FLOAT)
901 value.v_error = G_ERR_FLOAT_MALFORMED;
903 value.v_error = G_ERR_FLOAT_RADIX;
908 token = G_TOKEN_FLOAT;
909 gstring = g_string_append_c (gstring, ch);
923 gstring = g_string_append_c (gstring, ch);
928 if (token != G_TOKEN_FLOAT)
930 token = G_TOKEN_ERROR;
931 value.v_error = G_ERR_NON_DIGIT_IN_CONST;
935 gstring = g_string_append_c (gstring, ch);
940 if ((token != G_TOKEN_HEX && !config->scan_float) ||
941 (token != G_TOKEN_HEX &&
942 token != G_TOKEN_OCTAL &&
943 token != G_TOKEN_FLOAT &&
944 token != G_TOKEN_INT))
946 token = G_TOKEN_ERROR;
947 value.v_error = G_ERR_NON_DIGIT_IN_CONST;
952 if (token != G_TOKEN_HEX)
953 token = G_TOKEN_FLOAT;
954 gstring = g_string_append_c (gstring, ch);
959 if (token != G_TOKEN_HEX)
961 token = G_TOKEN_ERROR;
962 value.v_error = G_ERR_NON_DIGIT_IN_CONST;
966 gstring = g_string_append_c (gstring, ch);
974 value.v_binary = strtol (gstring->str, &endptr, 2);
978 value.v_octal = strtol (gstring->str, &endptr, 8);
982 value.v_int = strtol (gstring->str, &endptr, 10);
986 value.v_float = g_strtod (gstring->str, &endptr);
990 value.v_hex = strtol (gstring->str, &endptr, 16);
996 if (endptr && *endptr)
998 token = G_TOKEN_ERROR;
999 if (*endptr == 'e' || *endptr == 'E')
1000 value.v_error = G_ERR_NON_DIGIT_IN_CONST;
1002 value.v_error = G_ERR_DIGIT_RADIX;
1004 g_string_free (gstring, TRUE);
1011 if (config->cpair_comment_single &&
1012 ch == config->cpair_comment_single[0])
1014 token = G_TOKEN_COMMENT_SINGLE;
1015 in_comment_single = TRUE;
1016 gstring = g_string_new ("");
1017 while ((ch = g_scanner_get_char (scanner,
1021 if (ch == config->cpair_comment_single[1])
1023 in_comment_single = FALSE;
1028 gstring = g_string_append_c (gstring, ch);
1032 else if (config->scan_identifier && ch &&
1033 strchr (config->cset_identifier_first, ch))
1035 identifier_precedence:
1037 if (config->cset_identifier_nth && ch &&
1038 strchr (config->cset_identifier_nth,
1039 g_scanner_peek_next_char (scanner)))
1041 token = G_TOKEN_IDENTIFIER;
1042 gstring = g_string_new ("");
1043 gstring = g_string_append_c (gstring, ch);
1046 ch = g_scanner_get_char (scanner, line_p, position_p);
1047 gstring = g_string_append_c (gstring, ch);
1048 ch = g_scanner_peek_next_char (scanner);
1050 while (ch && strchr (config->cset_identifier_nth, ch));
1053 else if (config->scan_identifier_1char)
1055 token = G_TOKEN_IDENTIFIER;
1056 value.v_identifier = g_new0 (gchar, 2);
1057 value.v_identifier[0] = ch;
1063 if (config->char_2_token)
1067 token = G_TOKEN_CHAR;
1074 g_assert (ch == 0 && token != G_TOKEN_NONE);
1078 if (in_comment_multi ||
1079 in_comment_single ||
1083 token = G_TOKEN_ERROR;
1086 g_string_free (gstring, TRUE);
1090 if (in_comment_multi || in_comment_single)
1091 value.v_error = G_ERR_UNEXP_EOF_IN_COMMENT;
1092 else if (in_string_sq || in_string_dq)
1093 value.v_error = G_ERR_UNEXP_EOF_IN_STRING;
1098 value.v_string = gstring->str;
1099 g_string_free (gstring, FALSE);
1103 if (token == G_TOKEN_IDENTIFIER &&
1104 config->scan_symbols)
1106 register GScannerHashVal *hash_val;
1108 hash_val = g_scanner_lookup_internal (scanner, value.v_identifier);
1112 g_free (value.v_identifier);
1113 token = G_TOKEN_SYMBOL;
1114 value.v_symbol = hash_val->value;
1118 if (token == G_TOKEN_IDENTIFIER &&
1119 config->scan_identifier_NULL &&
1120 strlen (value.v_identifier) == 4)
1122 gchar *null_upper = "NULL";
1123 gchar *null_lower = "null";
1125 if (scanner->config->case_sensitive)
1127 if (value.v_identifier[0] == null_upper[0] &&
1128 value.v_identifier[1] == null_upper[1] &&
1129 value.v_identifier[2] == null_upper[2] &&
1130 value.v_identifier[3] == null_upper[3])
1131 token = G_TOKEN_IDENTIFIER_NULL;
1135 if ((value.v_identifier[0] == null_upper[0] ||
1136 value.v_identifier[0] == null_lower[0]) &&
1137 (value.v_identifier[1] == null_upper[1] ||
1138 value.v_identifier[1] == null_lower[1]) &&
1139 (value.v_identifier[2] == null_upper[2] ||
1140 value.v_identifier[2] == null_lower[2]) &&
1141 (value.v_identifier[3] == null_upper[3] ||
1142 value.v_identifier[3] == null_lower[3]))
1143 token = G_TOKEN_IDENTIFIER_NULL;