lex.l文件:
%{
#include <stdio.h>
#include <string.h>
#include <io.h>
#include "lexsymb.h"
#include "parse.cpp.h"
%}
/* Named patterns referenced by the rules section. */
/* Basic character classes. */
DIGIT   [0-9]
LETTER  [_A-Za-z]
/* Blanks and tabs only; newlines are handled by their own rule. */
WSPACE  [\t ]+
/* A letter or underscore followed by any mix of letters, underscores, digits. */
IDENT   {LETTER}[_A-Za-z0-9]*
/* Double-quoted text with no escape mechanism; may span several lines. */
STR     ["][^"]*["]
/* Exactly one LETTER between single quotes. */
CHAR    \'{LETTER}\'
/* NOTE(review): any number of leading minus signs belongs to the literal, */
/* so input such as a-1 scans as IDENT followed by INTEGER, never as SUB.  */
INTEGER -*{DIGIT}+
bool    true|false
%%
{WSPACE}    ;                           /* Eat all space */
\n          { lineno++; }               /* line counter used by the diagnostic at the end */
"="         { return ASSIGN; }
"=="        { return EQUAL; }           /* longest match wins, so this beats two ASSIGNs */
"!="        { return NOTEQUAL; }
"if"        { return IF; }              /* keywords precede IDENT so they win equal-length ties */
"else"      { return ELSE; }
"print"     { return PRINT; }
"input"     { return INPUT; }
"+"         { return CONCAT; }
";"         { return END_STMT; }
"("         { return OPEN_PAR; }
")"         { return CLOSE_PAR; }
"{"         { return BEGIN_CS; }
"}"         { return END_CS; }
"-"         { return SUB; }
"/"         { return DIV; }
"*"         { return MULT; }
"var"       { return VAR; }
","         { return COMMA; }
{bool}      {
                /* NOTE(review): no token is returned, so true/false are only
                 * reported on stdout and never reach the parser. */
                printf("bool value\n");
            }
{INTEGER}   {
                /* The literal is passed on as text in a fresh new[] buffer. */
                yylval.str = new char[strlen(yytext) + 1];
                strcpy(yylval.str, yytext);
                return INTEGER;
            }
{CHAR}      {
                /* Strip the opening and closing quote; a character literal is
                 * delivered to the parser as a one-character STRING. */
                int body_len = (int)yyleng - 2;
                yylval.str = new char[body_len + 1];
                memcpy(yylval.str, yytext + 1, body_len);
                yylval.str[body_len] = '\0';
                return STRING;
            }
{STR}       {
                /* Strip the opening and closing quote. */
                int body_len = (int)yyleng - 2;
                yylval.str = new char[body_len + 1];
                memcpy(yylval.str, yytext + 1, body_len);
                yylval.str[body_len] = '\0';
                /* The string pattern also matches newlines, which the newline
                 * rule therefore never sees; count them here so that lineno
                 * stays correct after a multi-line string. */
                for (int i = 0; i < body_len; i++) {
                    if (yylval.str[i] == '\n') {
                        lineno++;
                    }
                }
                return STRING;
            }
{IDENT}     {
                /* Identifier text is copied into a fresh new[] buffer. */
                yylval.str = new char[strlen(yytext) + 1];
                strcpy(yylval.str, yytext);
                return ID;
            }
.           { printf("Undefined symbo at line %d\n", lineno); return ERROR_TOKEN; }
%%
/*
 * End-of-input hook for the scanner.
 * Always returns 1, which tells the scanner there is no further input.
 */
int yywrap()
{
    const int no_more_input = 1;

    return no_more_input;
}
/* YYSTYPE yylval; */
/*
 * Current input line, incremented by the scanner for every newline and
 * printed in its "Undefined symbo" diagnostic. Starts at 1 so that an error
 * on the first line of input is not reported as line 0.
 */
int lineno = 1;
/*int main()
{
while ( yylex() )
{
}
return 0;
}
*/
#include <stdio.h>
#include <string.h>
#include <io.h>
#include "lexsymb.h"
#include "parse.cpp.h"
%}
/* Named patterns referenced by the rules section. */
/* Basic character classes. */
DIGIT   [0-9]
LETTER  [_A-Za-z]
/* Blanks and tabs only; newlines are handled by their own rule. */
WSPACE  [\t ]+
/* A letter or underscore followed by any mix of letters, underscores, digits. */
IDENT   {LETTER}[_A-Za-z0-9]*
/* Double-quoted text with no escape mechanism; may span several lines. */
STR     ["][^"]*["]
/* Exactly one LETTER between single quotes. */
CHAR    \'{LETTER}\'
/* NOTE(review): any number of leading minus signs belongs to the literal, */
/* so input such as a-1 scans as IDENT followed by INTEGER, never as SUB.  */
INTEGER -*{DIGIT}+
bool    true|false
%%
{WSPACE}    ;                           /* Eat all space */
\n          { lineno++; }               /* line counter used by the diagnostic at the end */
"="         { return ASSIGN; }
"=="        { return EQUAL; }           /* longest match wins, so this beats two ASSIGNs */
"!="        { return NOTEQUAL; }
"if"        { return IF; }              /* keywords precede IDENT so they win equal-length ties */
"else"      { return ELSE; }
"print"     { return PRINT; }
"input"     { return INPUT; }
"+"         { return CONCAT; }
";"         { return END_STMT; }
"("         { return OPEN_PAR; }
")"         { return CLOSE_PAR; }
"{"         { return BEGIN_CS; }
"}"         { return END_CS; }
"-"         { return SUB; }
"/"         { return DIV; }
"*"         { return MULT; }
"var"       { return VAR; }
","         { return COMMA; }
{bool}      {
                /* NOTE(review): no token is returned, so true/false are only
                 * reported on stdout and never reach the parser. */
                printf("bool value\n");
            }
{INTEGER}   {
                /* The literal is passed on as text in a fresh new[] buffer. */
                yylval.str = new char[strlen(yytext) + 1];
                strcpy(yylval.str, yytext);
                return INTEGER;
            }
{CHAR}      {
                /* Strip the opening and closing quote; a character literal is
                 * delivered to the parser as a one-character STRING. */
                int body_len = (int)yyleng - 2;
                yylval.str = new char[body_len + 1];
                memcpy(yylval.str, yytext + 1, body_len);
                yylval.str[body_len] = '\0';
                return STRING;
            }
{STR}       {
                /* Strip the opening and closing quote. */
                int body_len = (int)yyleng - 2;
                yylval.str = new char[body_len + 1];
                memcpy(yylval.str, yytext + 1, body_len);
                yylval.str[body_len] = '\0';
                /* The string pattern also matches newlines, which the newline
                 * rule therefore never sees; count them here so that lineno
                 * stays correct after a multi-line string. */
                for (int i = 0; i < body_len; i++) {
                    if (yylval.str[i] == '\n') {
                        lineno++;
                    }
                }
                return STRING;
            }
{IDENT}     {
                /* Identifier text is copied into a fresh new[] buffer. */
                yylval.str = new char[strlen(yytext) + 1];
                strcpy(yylval.str, yytext);
                return ID;
            }
.           { printf("Undefined symbo at line %d\n", lineno); return ERROR_TOKEN; }
%%
/*
 * End-of-input hook for the scanner.
 * Always returns 1, which tells the scanner there is no further input.
 */
int yywrap()
{
    const int no_more_input = 1;

    return no_more_input;
}
/* YYSTYPE yylval; */
/*
 * Current input line, incremented by the scanner for every newline and
 * printed in its "Undefined symbo" diagnostic. Starts at 1 so that an error
 * on the first line of input is not reported as line 0.
 */
int lineno = 1;
/*int main()
{
while ( yylex() )
{
}
return 0;
}
*/
parse.y
/* Semantic value shared by scanner and parser: the token text as a C string. */
%union
{
    char *str;  /* set by the scanner for ID, INTEGER and STRING tokens */
}
%{
#include <stdio.h>
#include <string.h>
#include <io.h>
#include <malloc.h>
#include "lexsymb.h"
%}
/*
 * Token declarations, kept in their original order.
 * ID, INTEGER and STRING are the tokens for which the scanner fills in
 * yylval.str, so they are tagged <str>; without the tag a $N reference to
 * them in a grammar action has no declared type.
 */
%token ERROR_TOKEN
%token IF
%token ELSE
%token PRINT
%token INPUT
%token ASSIGN
%token EQUAL
%token NOTEQUAL
%token END_STMT
%token OPEN_PAR
%token CLOSE_PAR
%token BEGIN_CS
%token END_CS
%token <str> ID
%token <str> INTEGER
%token <str> STRING
%token CONCAT
%token SUB
%token MULT
%token DIV
%token VAR
%token COMMA
%start program
%%
/* A program is a possibly empty sequence of statements. */
program:
      /* empty */
    | program statement                 { printf("program found!\n"); }
    ;

/* Every statement ends with END_STMT (the semicolon). */
statement:
      paramd END_STMT                   { printf("variable declaration!\n"); }
    | func END_STMT                     { printf("Call function!\n"); }
    | funcd END_STMT                    { printf("Declare function!\n"); }
    ;

/*
 * Function call: name() or name(a, b, ...).
 * Semantic values are read as $<str>N so that the actions are well typed
 * whether or not the token declarations carry a <str> tag.
 */
func:
      ID OPEN_PAR CLOSE_PAR             {}
    | ID OPEN_PAR paramlist CLOSE_PAR   { printf("func %s is found\n", $<str>1); }
    ;

/* One or more call arguments; left recursion allows any number of them. */
paramlist:
      param                             {}
    | paramlist COMMA param             {}
    ;

param:
      ID                                { printf("param name:%s \n", $<str>1); }
    ;

/*
 * Function declaration: name(var a, var b, ...).
 * The empty-parameter form is intentionally not listed here: it is the same
 * token sequence as an empty call in func, which made the grammar ambiguous
 * (reduce/reduce conflict). The parser generator resolved that conflict in
 * favour of the earlier func rule, so name(); is still reported as a call.
 */
funcd:
      ID OPEN_PAR paramlistd CLOSE_PAR  { printf("func %s is declared\n", $<str>1); }
    ;

/* One or more declared parameters; left recursion allows any number of them. */
paramlistd:
      paramd                            {}
    | paramlistd COMMA paramd           {}
    ;

/* Variable / parameter declaration: the name is the second symbol, not VAR. */
paramd:
      VAR ID                            { printf("param name:%s \n", $<str>2); }
    ;
%%
extern FILE* yyin;
/*
 * Entry point: runs the parser on the scanner's default input.
 * The result of yyparse() becomes the process exit status, so a failed
 * parse is visible to the caller instead of always exiting with 0.
 */
int main()
{
    //yyin = fopen("sample.txt", "r");
    return yyparse();
}
/*
 * Error reporting hook.
 * msg: description of the problem.
 * Prints the message on standard output and always returns 0.
 */
int yyerror(char *msg)
{
    const int status = 0;

    fprintf(stdout, "Error encountered: %s \n", msg);
    return status;
}
char* str;
}
%{
#include <stdio.h>
#include <string.h>
#include <io.h>
#include <malloc.h>
#include "lexsymb.h"
%}
/*
 * Token declarations, kept in their original order.
 * ID, INTEGER and STRING are the tokens for which the scanner fills in
 * yylval.str, so they are tagged <str>; without the tag a $N reference to
 * them in a grammar action has no declared type.
 */
%token ERROR_TOKEN
%token IF
%token ELSE
%token PRINT
%token INPUT
%token ASSIGN
%token EQUAL
%token NOTEQUAL
%token END_STMT
%token OPEN_PAR
%token CLOSE_PAR
%token BEGIN_CS
%token END_CS
%token <str> ID
%token <str> INTEGER
%token <str> STRING
%token CONCAT
%token SUB
%token MULT
%token DIV
%token VAR
%token COMMA
%start program
%%
/* A program is a possibly empty sequence of statements. */
program:
      /* empty */
    | program statement                 { printf("program found!\n"); }
    ;

/* Every statement ends with END_STMT (the semicolon). */
statement:
      paramd END_STMT                   { printf("variable declaration!\n"); }
    | func END_STMT                     { printf("Call function!\n"); }
    | funcd END_STMT                    { printf("Declare function!\n"); }
    ;

/*
 * Function call: name() or name(a, b, ...).
 * Semantic values are read as $<str>N so that the actions are well typed
 * whether or not the token declarations carry a <str> tag.
 */
func:
      ID OPEN_PAR CLOSE_PAR             {}
    | ID OPEN_PAR paramlist CLOSE_PAR   { printf("func %s is found\n", $<str>1); }
    ;

/* One or more call arguments; left recursion allows any number of them. */
paramlist:
      param                             {}
    | paramlist COMMA param             {}
    ;

param:
      ID                                { printf("param name:%s \n", $<str>1); }
    ;

/*
 * Function declaration: name(var a, var b, ...).
 * The empty-parameter form is intentionally not listed here: it is the same
 * token sequence as an empty call in func, which made the grammar ambiguous
 * (reduce/reduce conflict). The parser generator resolved that conflict in
 * favour of the earlier func rule, so name(); is still reported as a call.
 */
funcd:
      ID OPEN_PAR paramlistd CLOSE_PAR  { printf("func %s is declared\n", $<str>1); }
    ;

/* One or more declared parameters; left recursion allows any number of them. */
paramlistd:
      paramd                            {}
    | paramlistd COMMA paramd           {}
    ;

/* Variable / parameter declaration: the name is the second symbol, not VAR. */
paramd:
      VAR ID                            { printf("param name:%s \n", $<str>2); }
    ;
%%
extern FILE* yyin;
/*
 * Entry point: runs the parser on the scanner's default input.
 * The result of yyparse() becomes the process exit status, so a failed
 * parse is visible to the caller instead of always exiting with 0.
 */
int main()
{
    //yyin = fopen("sample.txt", "r");
    return yyparse();
}
/*
 * Error reporting hook.
 * msg: description of the problem.
 * Prints the message on standard output and always returns 0.
 */
int yyerror(char *msg)
{
    const int status = 0;

    fprintf(stdout, "Error encountered: %s \n", msg);
    return status;
}
这个语法文件的主要功能是解析出常见的语句:函数调用,函数声明,变量声明。
我的理解如下:
在第二部分(规则部分)填写的所有规则,其实构成了一棵语法解析树。如果有哪个规则没有放到树里面,bison编译时就会提示类似 contains 2 useless nonterminals and 4 useless rules 的警告,而且没有放到树里面的规则不会被用来进行语法解析。因此bison指令中有一个%start,用来指示从哪一个规则开始(也就是说哪一个是语法解析树的根)。可以尝试一下把 %start program 改成 %start funcd,那么只有
fool(var i)
这样的输入是有效的,而
fool(i);
fool(var i);
这样的输入都是无效的了。
statement: paramd END_STMT {printf("variable declaration!\n");}
| func END_STMT {printf("Call function!\n");}
| funcd END_STMT {printf("Declare function!\n");}
;
上面这一段代码是非终结符statement的语法规则。
在yacc里面一个语法规则是由如下方式构成的:
非终结符名称 : 语法规则体 ;
非终结符名称可以自己随意取,语法规则体里面可以用{}来添加控制代码,比如我这里的{printf("variable declaration!\n");}
语法规则如果有多种表现形式,可以用 '|' 来隔开,比如statement就可以有三种表达方式,第一种是paramd 后面加一个分号,第二种是func后面接分号,第三种是funcd后面接分号。
非终结符paramd形式:
var i
var name
非终结符func形式:
foo(i)
foo(i, j)
非终结符funcd形式:
test(var i)
hello(var i, var j)
非终结符paramlist形式:
i, j, k, l
param1, param2, param3
非终结符paramlistd形式:
var i, var j, var k
param和paramd就不写了,太简单了。
接下来的任务是: 完成整个语法解析功能和构造一棵语法树, 目前的打算是把语法弄成跟javascript相似,不过数据类型方面可能会增加一些。