fork download
  1. /* get_src.c */
  2. #include <stdio.h>
  3. #include <string.h>
  4. #include <ctype.h>
  5. #include <stdlib.h>
  6.  
  7. #define GET_SOURCE
  8. //#include "get_src.h"
  9.  
  10. /* get_src.h */
  11.  
  12. #ifndef GET_SRC_INCLUDED
  13. #define GET_SRC_INCLUDED
  14.  
  15. #include <stdio.h>
  16.  
  17. #ifdef __cplusplus
  18. extern "C" {
  19. #endif
  20.  
  21. /* This is the size of the largest token we'll attempt to deal with. If
  22.  * you want to deal with bigger tokens, change this, and recompile
  23.  * get_src.c. Note that an entire comment is treated as a single token,
  24.  * so long comments could overflow this. In case of an overflow, the
  25.  * entire comment will be read as a single token, but the part larger
  26.  * than this will not be stored.
  27.  */
  28. #define MAX_TOKEN_SIZE 8192
  29.  
  30. /* `last_token' will contain the text of the most recently read token.
  31.  */
  32. extern char last_token[];
  33.  
  34. /* This is the maximum number of characters that can be put back into a
  35.  * file opened with parse_fopen or parse_fdopen.
  36.  */
  37. #define MAX_UNGETS 5
  38.  
  39. #include <limits.h>
  40. #include <stdio.h>
  41.  
  42. typedef struct {
  43. FILE *file;
  44. char peeks[MAX_UNGETS];
  45. int last_peek;
  46. } PFILE;
  47.  
  48. /* Some codes we return to indicate having found various items in the
  49.  * source code. ERROR is returned to indicate a newline found in the
  50.  * middle of a character or string literal or if a file ends inside a
  51.  * comment, or if a character literal contains more than two characters.
  52.  *
  53.  * Note that this starts at INT_MIN, the most negative number available
  54.  * in an int. This keeps these symbols from conflicting with any
  55.  * characters read from the file. However, one of these could
  56.  * theoretically conflict with EOF. EOF is usually -1, and these are far
  57.  * more negative than that. However, officially EOF can be any value
  58.  * less than 0...
  59.  */
  enum {
      /* Codes for non-character items; numbered upward from INT_MIN. */
      ERROR = INT_MIN, /* malformed literal or unterminated comment */
      COMMENT,         /* a whole comment */
      CHAR_LIT,        /* a character literal */
      STR_LIT,         /* a string literal */
      IDENT,           /* any word that is not one of the keywords below */

      /* Keyword codes: these must stay in the same order as the entries
       * of the keyword table.
       */
      CASE,
      DEFAULT,
      DO,
      ELSE,
      IF,
      SWITCH,
      WHILE,
      INCLUDE,
      DEFINE
  };
  76.  
  77. /* Opens a file for parsing and returns a pointer to a structure which
  78.  * can be passed to the other functions in the parser/lexer to identify
  79.  * the file being worked with.
  80.  */
  81. PFILE *parse_fopen(char const *name);
  82.  
  83. /* This corresponds closely to fdopen - it takes a FILE * as its
  84.  * only parameter, creates a PFILE structure identifying that file, and
  85.  * returns a pointer to that structure.
  86.  */
  87. PFILE *parse_fdopen(FILE *stream);
  88.  
  89. /* Corresponds to fclose.
  90.  */
  91. int parse_fclose(PFILE *stream);
  92.  
  93. /* returns characters from `stream' read as C source code. String
  94.  * literals, character literals and comments are each returned as a
  95.  * single code from those above. A run of whitespace is returned as a
  96.  * single space character, except that a newline is returned as '\n'.
  97.  */
  98. int get_source(PFILE *stream);
  99.  
  100. /* As above, but adds classification of some C keywords as well. The
  101.  * keywords recognized are mostly for flow control and are those listed
  102.  * in the enumeration above, following IDENT. All identifiers and
  103.  * unrecognized keywords are returned as IDENT.
  104.  */
  105. int get_token(PFILE *stream);
  106.  
  107. /* If called with a value of 0, turns off recognition of C++ digraphs
  108.  * and single line comments. If called with a non-zero value, turns
  109.  * on recognition of same. Default is 1.
  110.  */
  111. //void read_CPP(int status);
  112.  
  113. /* Basically, these two work just like the normal versions of the same,
  114.  * with the minor exception that unget_character can unget more than one
  115.  * character.
  116.  */
  117. int get_character(PFILE *stream);
  118. void unget_character(int ch, PFILE *stream);
  119.  
  120. #ifdef __cplusplus
  121. }
  122. #endif
  123.  
  124. #endif
  125.  
  126. #if 1
  127. /* These are the keywords we recognize - those that involve flow control.
  128.  * recognition of all keywords simply involves adding them to this list,
  129.  * and adding matching identifiers to the enumeration in get_src.h. The
  130.  * matching enumerators for these keywords start after IDENT in the
  131.  * enumeration, but MUST be maintained in the same order as the keywords
  132.  * appear here. E.g. if "case" remains the first keyword here, `CASE'
  133.  * must follow immediately after IDENT in the enumeration. Any types
  134.  * added to the enumeration that do not have matching keywords should
  135.  * precede `IDENT'.
  136.  */
  /* Keyword spellings, in the same order as the enumerators that follow
   * IDENT. Both the strings and the table itself are read-only, so they
   * are const-qualified; this also keeps the file free of "conversion
   * from string constant to char *" diagnostics when built as C++.
   */
  static const char *const keys[] = {
      "case",
      "default",
      "do",
      "else",
      "if",
      "switch",
      "while",
      "#include",
      "#define"
  };
  148.  
  /* Number of elements in the array `x'. Only valid for true arrays, not
   * for pointers. The argument is parenthesized so that any array-valued
   * expression can be passed safely.
   */
  #define elems(x) (sizeof(x) / sizeof((x)[0]))
  150. #endif
  151.  
  152. static size_t current = 0;
  153.  
  154. char last_token[MAX_TOKEN_SIZE];
  155.  
  156. PFILE *parse_fopen(char const *name) {
  157.  
  158. PFILE *temp = malloc(sizeof(PFILE));
  159.  
  160. if ( NULL != temp ) {
  161. temp->file = fopen(name, "r");
  162. memset(temp->peeks, 0, sizeof(temp->peeks));
  163. temp->last_peek = 0;
  164. }
  165. return temp;
  166. }
  167.  
  168. PFILE *parse_fdopen(FILE *file) {
  169.  
  170. PFILE *temp = malloc(sizeof(PFILE));
  171.  
  172. if ( NULL != temp) {
  173. temp->file = file;
  174. memset(temp->peeks, 0, sizeof(temp->peeks));
  175. temp->last_peek = 0;
  176. }
  177. return temp;
  178. }
  179.  
  180. int parse_fclose(PFILE *stream) {
  181.  
  182. int retval = fclose(stream->file);
  183.  
  184. free(stream);
  185. return retval;
  186. }
  187.  
  188. static void addchar(int ch) {
  189. /* adds the passed character to the end of `last_token' */
  190.  
  191. if ( current < sizeof(last_token) -1 )
  192. last_token[current++] = (char)ch;
  193.  
  194. if ( current == sizeof(last_token)-1 )
  195. last_token[current] = '\0';
  196. }
  197.  
  198. static void clear(void) {
  199. /* clears the previous token and starts building a new one. */
  200. current = 0;
  201. }
  202.  
  203. static int read_char(PFILE *stream) {
  204. if ( stream->last_peek > 0 )
  205. return stream->peeks[--stream->last_peek];
  206. return fgetc(stream->file);
  207. }
  208.  
  209. void unget_character(int ch, PFILE * stream) {
  210. if ( stream->last_peek < sizeof(stream->peeks) )
  211. stream->peeks[stream->last_peek++] = ch;
  212. }
  213.  
  214. /* Here's where we start getting into sort of sophisticated stuff.
  215.  */
  216.  
  217. static int check_trigraph(PFILE *stream) {
  218. /* Checks for trigraphs and returns the equivalant character if there
  219.  * is one. Expects that the leading '?' of the trigraph has already
  220.  * been read before this is called.
  221.  */
  222.  
  223. int ch;
  224.  
  225. if ( '?' != (ch=read_char(stream))) {
  226. unget_character(ch, stream);
  227. return '?';
  228. }
  229.  
  230. ch = read_char(stream);
  231.  
  232. switch( ch ) {
  233. case '(': return '[';
  234. case ')': return ']';
  235. case '/': return '\\';
  236. case '\'': return '^';
  237. case '<': return '{';
  238. case '>': return '}';
  239. case '!': return '|';
  240. case '-': return '~';
  241. case '=': return '#';
  242. default:
  243. unget_character('?', stream);
  244. unget_character(ch, stream);
  245. return '?';
  246. }
  247. }
  248.  
  249. static int check_digraph(PFILE *stream, int first) {
  250. /* Checks for a digraph. The first character of the digraph is
  251.  * transmitted as the second parameter, as there are several possible
  252.  * first characters of a digraph.
  253.  */
  254.  
  255. int ch = read_char(stream);
  256.  
  257. switch(first) {
  258. case '<':
  259. if ( '%' == ch )
  260. return '{';
  261. if ( ':' == ch )
  262. return '[';
  263. break;
  264. case ':':
  265. if ( '>' == ch )
  266. return ']';
  267. break;
  268. case '%':
  269. if ( '>' == ch )
  270. return '}';
  271. if ( ':' == ch )
  272. return '#';
  273. break;
  274. }
  275.  
  276. /* If it's not one of the specific combos above, return the characters
  277.  * separately and unchanged by putting the second one back into the
  278.  * stream, and returning the first one as-is.
  279.  */
  280. unget_character(ch, stream);
  281. return first;
  282. }
  283.  
  284. static int get_char(PFILE *stream) {
  285. /* Gets a single character from the stream with any trigraphs ( and if
  286.  * C++ support is turned on, digraphs ) converted to the single character
  287.  * represented.
  288.  */
  289. int ch = read_char(stream);
  290.  
  291. if ( ch == '?' )
  292. return check_trigraph(stream);
  293.  
  294. if (( ch == '<' || ch == ':' || ch == '%' ))
  295. return check_digraph(stream, ch);
  296.  
  297. return ch;
  298. }
  299.  
  300. int get_character(PFILE *stream) {
  301. /* gets a character from `stream'. Any amount of any kind of whitespace
  302.  * is returned as a single space. Escaped new-lines are "eaten" here as well.
  303.  */
  304. int ch;
  305.  
  306. if ( !isspace(ch=get_char(stream)) && ch != '\\')
  307. return ch;
  308.  
  309. // handle line-slicing
  310. if (ch == '\\') {
  311. ch = get_char(stream);
  312. if (ch == '\n')
  313. ch = get_char(stream);
  314. else {
  315. unget_character(ch, stream);
  316. return ch;
  317. }
  318. }
  319.  
  320. /* If it's a space, skip over consecutive white-space */
  321. while (isspace(ch) && ('\n' != ch))
  322. ch = get_char(stream);
  323.  
  324. if ('\n' == ch)
  325. return ch;
  326.  
  327. /* Then put the non-ws character back */
  328. unget_character(ch, stream);
  329.  
  330. /* and return a single space character... */
  331. return ' ';
  332. }
  333.  
  334. static int read_char_lit(PFILE *stream) {
  335. /* This is used internally by `get_source' (below) - it expects the
  336.  * opening quote of a character literal to have already been read and
  337.  * returns CHAR_LIT or ERROR if there's a newline before a close
  338.  * quote is found, or if the character literal contains more than two
  339.  * characters after escapes are taken into account.
  340.  */
  341.  
  342. int ch;
  343. int i;
  344.  
  345.  
  346. clear();
  347. addchar('\'');
  348.  
  349. for (i=0; i<2 && ('\'' != ( ch = read_char(stream))); i++) {
  350.  
  351. addchar(ch);
  352.  
  353. if ( ch == '\n' )
  354. return ERROR;
  355.  
  356. if (ch == '\\' ) {
  357. ch = get_char(stream);
  358. addchar(ch);
  359. }
  360. }
  361. addchar('\'');
  362. addchar('\0');
  363.  
  364. if ( i > 2 )
  365. return ERROR;
  366.  
  367. return CHAR_LIT;
  368. }
  369.  
  370. static int read_str_lit(PFILE *stream) {
  371. /* Used internally by get_source. Expects the opening quote of a string
  372.  * literal to have already been read. Returns STR_LIT, or ERROR if a
  373.  * un-escaped newline is found before the close quote.
  374.  */
  375.  
  376. int ch;
  377.  
  378. clear();
  379. addchar('"');
  380.  
  381. while ( '"' != ( ch = get_char(stream))) {
  382.  
  383. if ( '\n' == ch || EOF == ch )
  384. return ERROR;
  385.  
  386. addchar(ch);
  387.  
  388. if( ch == '\\' ) {
  389. ch = read_char(stream);
  390. addchar(ch);
  391. }
  392.  
  393. }
  394.  
  395. addchar('"');
  396. addchar('\0');
  397.  
  398. return STR_LIT;
  399. }
  400.  
  401. static int read_comment(PFILE *stream) {
  402. /* Skips over a comment in stream. Assumes the leading '/' has already
  403.  * been read and skips over the body. If we're reading C++ source, skips
  404.  * C++ single line comments as well as normal C comments.
  405.  */
  406. int ch;
  407.  
  408. clear();
  409.  
  410. ch = get_char(stream);
  411.  
  412. /* Handle a single line comment.
  413.   */
  414. if ('/' == ch) {
  415. addchar('/');
  416. addchar('/');
  417.  
  418. while ( '\n' != ( ch = get_char(stream)))
  419. addchar(ch);
  420.  
  421. addchar('\0');
  422. return COMMENT;
  423. }
  424.  
  425. if ('*' != ch ) {
  426. unget_character(ch, stream);
  427. return '/';
  428. }
  429.  
  430. addchar('/');
  431.  
  432. do {
  433. addchar(ch);
  434. while ('*' !=(ch = get_char(stream)))
  435. if (EOF == ch)
  436. return ERROR;
  437. else
  438. addchar(ch);
  439. addchar(ch);
  440. } while ( '/' != (ch=get_char(stream)));
  441.  
  442. addchar('/');
  443. addchar('\0');
  444.  
  445. return COMMENT;
  446. }
  447.  
  448. int get_source(PFILE *stream) {
  449. /* reads and returns a single "item" from the stream. An "item" is a
  450.  * comment, a literal or a single character after trigraph and possible
  451.  * digraph substitution has taken place.
  452.  */
  453.  
  454. int ch = get_character(stream);
  455.  
  456. switch(ch) {
  457. case '\'':
  458. return read_char_lit(stream);
  459. case '"':
  460. return read_str_lit(stream);
  461. case '/':
  462. return read_comment(stream);
  463. default:
  464. return ch;
  465. }
  466. }
  467. #if TOKEN
  468. int get_token(PFILE *stream) {
  469. /* This gets a single token from the input stream and places the text
  470.  * of the token in last_token, and returns an identifier of the type of
  471.  * the token. Only flow control keywords are recognized individually.
  472.  * All other keywords are simply returned as IDENT's, just like other
  473.  * identifiers.
  474.  */
  475.  
  476. int ch;
  477. int i;
  478.  
  479. ch = get_source(stream);
  480.  
  481. /* If we've got an identifier, read as many characters as can
  482.   * possibly constitute the identifier ( maximal munch ) and build
  483.   * up the complete identifier in `last_token'
  484.   */
  485. if ( ch > 0 && ('_' == ch || isalpha(ch))) {
  486. clear();
  487. while(ch > 0 && (isalpha(ch) || isdigit(ch) || '_' == ch )) {
  488. addchar(ch);
  489. ch = get_source(stream);
  490. }
  491. unget_character(ch,stream);
  492.  
  493. addchar('\0');
  494.  
  495. /* Now we look in our table to see if we've got a keyword
  496.   * we recognize, or some random identifier.
  497.   */
  498. for (i=0;i<elems(keys);i++) {
  499. if ( 0 == strcmp(last_token, keys[i]))
  500. return IDENT+i+1;
  501. }
  502.  
  503. /* we didn't recognize it - it must be a normal identifier. */
  504. return IDENT;
  505. }
  506.  
  507. /* it's not an identifier - just return it as a character. */
  508. return ch;
  509. }
  510. #endif
  511. #ifdef TEST
  512.  
  513. int main(int argc, char **argv) {
  514. PFILE *f;
  515. int ch;
  516.  
  517. if (argc != 2) {
  518. fprintf(stderr, "Usage: get_src <filename>\n");
  519. return EXIT_FAILURE;
  520. }
  521.  
  522. if (NULL==(f= parse_fopen(argv[1]))) {
  523. fprintf(stderr, "Unable to open: %s\n", argv[1]);
  524. return EXIT_FAILURE;
  525. }
  526.  
  527. while (EOF!=(ch=get_source(f)))
  528. if (ch < 0)
  529. printf("\n%s\n", last_token);
  530. else
  531. printf("%c", ch);
  532. parse_fclose(f);
  533. return 0;
  534. }
  535.  
  536. #endif
  537.  
Compilation error #stdin compilation error #stdout 0s 0KB
stdin
Standard input is empty
compilation info
prog.cpp:147:1: warning: deprecated conversion from string constant to ‘char*’ [-Wwrite-strings]
prog.cpp:147:1: warning: deprecated conversion from string constant to ‘char*’ [-Wwrite-strings]
prog.cpp:147:1: warning: deprecated conversion from string constant to ‘char*’ [-Wwrite-strings]
prog.cpp:147:1: warning: deprecated conversion from string constant to ‘char*’ [-Wwrite-strings]
prog.cpp:147:1: warning: deprecated conversion from string constant to ‘char*’ [-Wwrite-strings]
prog.cpp:147:1: warning: deprecated conversion from string constant to ‘char*’ [-Wwrite-strings]
prog.cpp:147:1: warning: deprecated conversion from string constant to ‘char*’ [-Wwrite-strings]
prog.cpp:147:1: warning: deprecated conversion from string constant to ‘char*’ [-Wwrite-strings]
prog.cpp:147:1: warning: deprecated conversion from string constant to ‘char*’ [-Wwrite-strings]
prog.cpp: In function ‘PFILE* parse_fopen(const char*)’:
prog.cpp:158:39: error: invalid conversion from ‘void*’ to ‘PFILE*’ [-fpermissive]
prog.cpp: In function ‘PFILE* parse_fdopen(FILE*)’:
prog.cpp:170:39: error: invalid conversion from ‘void*’ to ‘PFILE*’ [-fpermissive]
prog.cpp: In function ‘void unget_character(int, PFILE*)’:
prog.cpp:210:50: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]
prog.cpp: At global scope:
prog.cpp:137:14: warning: ‘keys’ defined but not used [-Wunused-variable]
stdout
Standard output is empty