zoukankan      html  css  js  c++  java
  • 编译器DIY——词法分析

    在上一篇文章中已经介绍了读文件的操作,那么这一篇文章中将会细致解释词法分析。

    在源文件里解析出的单词流必须识别为保留字,标识符,常量,操作符和界符五大类

    1.显然我们须要列举出全部的保留字,而这里与保留字相似的那么就是标识符,在C语言中,保留字都是以小写字母开头,并且当中的字母仅仅能是小写字母,而标识符的第一个字母则必须为字符(小写大写皆可)后面能够接大写和小写字母和字符 ‘_’, 在我写的这个编译器中,标识符不能超过100,在C语言中的标识符定义的长度大小远远大于此。

    2.对于常量,这里须要注意的是整型和浮点型常量。

    3.运算符依照的是以下的表:

    C语言运算符

    运算符依照优先级大小由上向下排列,在同一行的运算符具有同样优先级。第二行是全部的一元运算符。
     

    运算符
    解释
    结合方式
    () [] -> . 括号(函数等),数组,两种结构成员訪问
    由左向右
    ! ~ ++ -- + - 

    * &

    否定,按位否定,增量,减量,正负号,

    间接,取地址

    由右向左
    * / % 乘,除,取模
    由左向右
    + - 加,减
    由左向右
    << >> 左移,右移
    由左向右
    < <= >= > 小于,小于等于,大于等于,大于
    由左向右
    == != 等于,不等于
    由左向右
    & 按位与
    由左向右
    ^ 按位异或
    由左向右
    | 按位或
    由左向右
    && 逻辑与
    由左向右
    || 逻辑或
    由左向右
    ? : 条件
    由右向左
    = += -= *= /= 

    &= ^= |= <<= >>=

    各种赋值
    由右向左
    , 逗号(顺序)
    由左向右

    4.界符:“;”“{}”,单引號,双引號

    接下来我介绍的是对保留字的归类,为了查找方便,将保留字依照a-z的顺序排好,根据数组的下标定位,降低寻找的时间


    /*
     * keyword.h
     *
     *  Created on: Jun 12, 2014
     *    
     */
    
    #ifndef KEYWORD_H_
    #define KEYWORD_H_
    
    struct keyword{
    	char *keyName;
    };
    
    static struct keyword key__[]={
    		{"__int64"},
    		{"end"}
    };
    
    static struct keyword key_A[]={
    		{"auto"},
    		{"end"}
    };
    static struct keyword key_B[]={
    		{"break"},
    		{"end"}
    };
    static struct keyword key_C[]={
    		{"case"},
    		{"char"},
    		{"const"},
    		{"continue"},
    		{"end"}
    };
    static struct keyword key_D[]={
    		{"default"},
    		{"do"},
    		{"double"},
    		{"end"}
    };
    static struct keyword key_E[]={
    		{"else"},
    		{"enum"},
    		{"extern"},
    		{"end"}
    };
    static struct keyword key_F[]={
    		{"float"},
    		{"for"},
    		{"end"}
    };
    static struct keyword key_G[]={
    		{"goto"},
    		{"end"}
    };
    static struct keyword key_H[]={
    		{"end"}
    };
    static struct keyword key_I[]={
    		{"if"},
    		{"int"},
    		{"end"}
    };
    static struct keyword key_J[]={
    		{"end"}
    };
    static struct keyword key_K[]={
    		{"end"}
    };
    static struct keyword key_L[]={
    		{"long"},
    		{"end"}
    };
    static struct keyword key_M[]={
    		{"end"}
    };
    static struct keyword key_N[]={
    		{"end"}
    };
    static struct keyword key_O[]={
    		{"end"}
    };
    static struct keyword key_P[]={
    		{"end"}
    };
    static struct keyword key_Q[]={
    		{"end"}
    };
    static struct keyword key_R[]={
    		{"register"},
    		{"return"},
    		{"end"}
    };
    static struct keyword key_S[]={
    		{"short"},
    		{"signed"},
    		{"sizeof"},
    		{"static"},
    		{"struct"},
    		{"switch"},
    		{"end"}
    };
    static struct keyword key_T[]={
    		{"typedef"},
    		{"end"}
    };
    static struct keyword key_U[]={
    		{"union"},
    		{"unsigned"},
    		{"end"}
    };
    static struct keyword key_V[]={
    		{"void"},
    		{"volatile"},
    		{"end"}
    };
    static struct keyword key_W[]={
    		{"while"},
    		{"end"}
    };
    static struct keyword key_X[]={
    		{"end"}
    };
    static struct keyword key_Y[]={
    		{"end"}
    };
    static struct keyword key_Z[]={
    		{"end"}
    };
    // size is 27
    static struct keyword *keywords[]={
    		key__,key_A,key_B,key_C,key_D,key_E,
    		key_F,key_G,key_H,key_I,key_J,key_K,
    		key_L,key_M,key_N,key_O,key_P,key_Q,
    		key_R,key_S,key_T,key_U,key_V,key_W,
    		key_X,key_Y,key_Z
    };
    
    #endif /* KEYWORD_H_ */
    


    以下是词法分析的源代码;

    /*
     * lex.h
     *
     *  Created on: Jun 13, 2014
     *     
     */
    #include "input.h"
    #include "keyword.h"
    
    #define isDigit(c)			(c>='0' && c<='9')
    #define isUpperLetter(c)	(c>='A' && c <='Z')
    #define isLowerLetter(c)	(c>='a' && c<='z')
    #define isLetter(c)			(isUpperLetter || isLowerLetter)


    /*
     * lex.c
     *
     *  Created on: Jun 13, 2014
     *      
     */
    #include "zcc.h"
    #include "lex.h"
    
    #define curr source.cursor
    
    int getToken() {
    	char a[100];
    	int a_length, i, flag;
    	/*
    	 *skip ' ','
    ' and ''
    	 */
    	while (*curr == ' ' || *curr == 10 || *curr == 9) {
    		curr++;
    		if (*curr == END_OF_FILE) {
    			return -1;
    		}
    	}
    	/* name or keyword on first is a-z */
    	a_length=0;
    	if (*curr >= 'a' && *curr <= 'z') {
    		IDAndKey:
    		a_length = 0;
    		do {
    			a[a_length++] = *curr++;
    		} while ( isDigit(*curr) || isUpperLetter(*curr) || isLowerLetter(*curr)
    				|| *curr == '_');
    		a[a_length] = '';
    		i = 0;
    		flag = 0;
    		if (*a - 'a' <= 26 && *a - 'a' >= 0) {
    			while (strcmp(keywords[*a - 'a' + 1][i].keyName, "end") != 0) {
    				if (strcmp(keywords[*a - 'a' + 1][i].keyName, a) == 0) {
    					flag = 1;
    					break;
    				}
    				i++;
    			}
    			if (flag == 1) {
    				printf("keyword is %s
    ", a);
    				return 1;
    			} else {
    				printf("Identify is %s
    ", a);
    				return 1;
    			}
    		} else {
    			printf("Identify is %s
    ", a);
    			return 1;
    		}
    	} else if (isUpperLetter(*curr)) {
    		goto IDAndKey;
    	} else if (isDigit(*curr)) {
    		a_length = 0;
    		do {
    			a[a_length++] = *curr++;
    		} while (isDigit(*curr));
    		//float number
    		if (*curr == '.') {
    			do {
    				a[a_length++] = *curr++;
    			} while (isDigit(*curr));
    			a[a_length] = '';
    			printf("float number is %s
    ", a);
    			return 1;
    		} else {
    			// number
    			a[a_length] = '';
    			printf("number is %s
    ", a);
    			return 1;
    		}
    	/*
    	 * Operator begin
    	 * */
    	} else if (*curr == '<') {
    		a[a_length++] = *curr++;
    		if (*curr == '<') {
    			a[a_length++] = *curr++;
    		lastOperatorDeal:
    			a[a_length] = '';
    			printf("Operator is %s
    ", a);
    			return 1;
    		} else if (*curr == '=') {
    			a[a_length++] = *curr++;
    			goto lastOperatorDeal;
    		} else {
    			goto lastOperatorDeal;
    		}
    	} else if (*curr == '>') {
    		a[a_length++] = *curr++;
    		if (*curr == '>') {
    			a[a_length++] = *curr++;
    			goto lastOperatorDeal;
    		} else if (*curr == '=') {
    			a[a_length++] = *curr++;
    			goto lastOperatorDeal;
    		} else {
    			goto lastOperatorDeal;
    		}
    
    	} else if (*curr == '=') {
    		a[a_length++] = *curr++;
    		if (*curr == '=') {
    			a[a_length++] = *curr++;
    			goto lastOperatorDeal;
    		} else {
    			goto lastOperatorDeal;
    		}
    	} else if (*curr == '(') {
    	    singleOperator:
    		a[a_length++] = *curr++;
    		goto lastOperatorDeal;
    	} else if (*curr == ')') {
    		goto singleOperator;
    	} else if (*curr == '[') {
    		goto singleOperator;
    	} else if (*curr == ']') {
    		goto singleOperator;
    	} else if (*curr == '-') {
    		a[a_length++] = *curr++;
    		if (*curr == '>') {
    			a[a_length++] = *curr++;
    			goto lastOperatorDeal;
    		} else if (*curr == '-') {
    			a[a_length++] = *curr++;
    			goto lastOperatorDeal;
    		} else if (*curr == '=') {
    			a[a_length++] = *curr++;
    			goto lastOperatorDeal;
    		} else {
    			goto lastOperatorDeal;
    		}
    	}else if(*curr=='.'){
    		goto singleOperator;
    	}else if(*curr=='!'){
    		a[a_length++]=*curr++;
    		if(*curr=='='){
    			goto singleOperator;
    		}else{
    			goto lastOperatorDeal;
    		}
    	}else if(*curr=='~'){
    		goto singleOperator;
    	}else if(*curr=='+'){
            a[a_length++]=*curr++;
            if(*curr=='+'){
            	goto singleOperator;
            }else if(*curr=='='){
            	goto singleOperator;
            }else {
            	goto lastOperatorDeal;
            }
    	}else if(*curr=='-'){
            a[a_length++]=*curr++;
            if(*curr=='-'){
            	goto singleOperator;
            }else if(*curr=='='){
            	goto singleOperator;
            }else {
            	goto lastOperatorDeal;
            }
    	}else if(*curr=='*'){
            a[a_length++]=*curr++;
            if(*curr=='='){
            	goto singleOperator;
            }else{
                goto lastOperatorDeal;
            }
    	}else if(*curr=='&'){
    		a[a_length++]=*curr++;
    		if(*curr=='&'){
    			goto singleOperator;
    		}else if(*curr=='='){
    			goto singleOperator;
    		}else{
    			goto lastOperatorDeal;
    		}
    	}else if(*curr=='/'){
    		a[a_length++]=*curr++;
    	    if(*curr=='='){
    	    	goto singleOperator;
    	    }if(*curr=='/'){
            	// skip line
            	while(*curr!='
    '){
            		if(*curr==END_OF_FILE)
            			return -1;
            		curr++;
            	}
            }else if(*curr=='*'){
            	curr++;
            	// skip "/**/"
                while(*curr!=END_OF_FILE)
                {
                	if(*curr=='*' && *(curr+1)=='/'){
                		curr+=2;
                		break;
                	}
                    curr++;
                }
            }else{
            	goto lastOperatorDeal;
            }
    	}else if(*curr=='%'){
    		a[a_length++]=*curr++;
    		if(*curr=='d'){
    			goto singleOperator;
    		}else if(*curr=='c'){
    			goto singleOperator;
    		}else if(*curr=='f'){
    			goto singleOperator;
    		}else if(*curr=='l'){
    			a[a_length++]=*curr++;
    			if(*curr=='d')
    				goto singleOperator;
    			else if(*curr=='f')
    				goto singleOperator;
    			else
    				goto singleOperator;
    		}
    
    	}else if(*curr=='^'){
    		a[a_length++]=*curr++;
    	    if(*curr=='='){
    	    	goto singleOperator;
    	    }else{
    	    	goto lastOperatorDeal;
    	    }
    	}else if(*curr=='|'){
    		a[a_length++]=*curr++;
    		if(*curr=='|'){
    			goto singleOperator;
    		}else if(*curr=='='){
    			goto singleOperator;
    		}else{
    			goto lastOperatorDeal;
    		}
    	}else if(*curr=='?'){
            goto singleOperator;
    	}else if(*curr==':'){
            goto singleOperator;
    	}else if(*curr==','){
    		goto singleOperator;
    	}else if(*curr=='\'){
    		a[a_length++]=*curr++;
    		if(*curr=='n'){
    			goto singleOperator;
    		}else {
    			goto lastOperatorDeal;
    		}
    
    	}
    	/*
    	 * Operator end
    	 * */
    	/*
    	 * delimiter begin
    	 * */
    	else if(*curr=='{'){
    		singleDelimiter:
    		a[a_length++]=*curr++;
    		a[a_length]='';
    		printf("Delimiter is %s
    ", a);
    		return 1;
    	}else if(*curr=='}'){
            goto singleDelimiter;
    	}else if(*curr==';'){
    		goto singleDelimiter;
    	}else if(*curr=='''){
    		goto singleDelimiter;
    	}else if(*curr=='"'){
    		goto singleDelimiter;
    	}
    }
    

    这里实现了将单词分成五类流,并将单词打印出来,在后面的语法分析中将会使用到这里的单词流结果。

    忘了说了,我将自己写的编译器命名为:ZCC,头文件都包括在zcc.h中(*^__^*) 嘻嘻……,想写个类似与gcc 一样奇妙的玩意。

    最后看測试文档:


    struct  Student{
       int a;
       char* name;
    }
    
    int main()
    {
        int a=123;
        float a2=1.2345677;
        int b=1+3;
        for(int i=0; i < 100; i++)
        		a+=i;
        printf("%d
    ", a);
        return 0;
    }
    



    測试结果:


    keyword is struct
    Identify is Student
    Delimiter is {
    keyword is int
    Identify is a
    Delimiter is ;
    keyword is char
    Operator is *
    Identify is name
    Delimiter is ;
    Delimiter is }
    keyword is int
    Identify is main
    Operator is (
    Operator is )
    Delimiter is {
    keyword is int
    Identify is a
    Operator is =
    number is 123
    Delimiter is ;
    keyword is float
    Identify is a2
    Operator is =
    float number is 1.2345677
    Delimiter is ;
    keyword is int
    Identify is b
    Operator is =
    number is 1
    Operator is +
    number is 3
    Delimiter is ;
    keyword is for
    Operator is (
    keyword is int
    Identify is i
    Operator is =
    number is 0
    Delimiter is ;
    Identify is i
    Operator is <
    number is 100
    Delimiter is ;
    Identify is i
    Operator is ++
    Operator is )
    Identify is a
    Operator is +=
    Identify is i
    Delimiter is ;
    Identify is printf
    Operator is (
    Delimiter is "
    Operator is %d
    Operator is 
    
    Delimiter is "
    Operator is ,
    Identify is a
    Operator is )
    Delimiter is ;
    keyword is return
    number is 0
    Delimiter is ;
    Delimiter is }
    


    做到这里,能够告一小段落了,接下来做的事情就是语法分析。

  • 相关阅读:
    mac 外接显示屏的坑
    ssh 多秘钥管理和坑
    CircleCI 前端自动部署
    jest 事件测试
    await Vue.nextTick() 的含义分析
    Jest 里面需要注意的几个小细节
    element 库 date-picker 的 disabledDate 的坑
    jest 提示 Unexpected identifier 的解决方案
    preventDefault 和 stopPropagation
    数据库:Flask-SQLAlchemy
  • 原文地址:https://www.cnblogs.com/mengfanrong/p/3843400.html
Copyright © 2011-2022 走看看