本文共 12434 字,大约阅读时间需要 41 分钟。
本节书摘来自华章社区《编译与反编译技术实战》一书中的第3章,第3.2节词法分析器的手工实现,作者刘晓楠 陶红伟 岳峰 戴超,更多章节内容可以访问云栖社区“华章社区”公众号查看
3.2 词法分析器的手工实现
手工构造词法分析器首先需要将描述单词符号的正规文法或者正规式转化为状态转换图,然后再依据状态转换图进行词法分析器的构造。状态转换图是一个有限方向图,结点代表状态,用圆圈表示;状态之间用箭弧连接,箭弧上的标记(字符)代表射出结点状态下可能出现的输入字符或字符类。一张转换图只包含有限个状态,其中有一个为初态,至少要有一个终态(用双圈表示)。大多数程序语言的单词符号都可以用状态转换图予以识别。具体过程如下:1)从初始状态出发。2)读入一个字符。3)按当前字符转入下一状态。4)重复步骤2~3直到无法继续转移为止。在遇到读入的字符是单词的分界符时,若当前状态是终止状态,说明读入的字符组成了一个单词;否则,说明输入字符串不符合词法规则。这里我们以一个C语言子集作为例子,说明如何基于状态转换图手工编写词法分析器,该语言子集几乎包含了C语言所有的单词符号。主要步骤是,首先给出描述该子集中各种单词符号的词法规则,其次构造其状态转换图,然后根据状态转换图编写词法分析器。表3-1列出了这个C语言子集的所有单词符号以及它们的种别编码。该语言子集所包含的单词符号有:1)标识符:以字母、下划线开头的字母、数字和下划线组成的符号串。2)关键字:标识符的子集,C语言的关键字共有32个(为了测试加入了输入输出函数,共计34个)。3)无符号十进制整数:由0到9数字组成的字符串。4)算符和界符:“{”“}”“[”“]”“(”“)”“.”“->”“~”“++”“--”“!”“&”“*”“/”“%”“+”“-”“<<”“>>”“>”“>=”“<”“<=”“==”“!=”“^”“|”“&&”“||”“?”“=”“/=”“*=”“%=”“+=”“-=”“&=”“^=”“|=”“,”“#”“;”“:”,共计44个。表3-1 C语言子集的单词符号及种别编码单词符号 种别编码 单词符号 种别编码 单词符号 种别编码 单词符号 种别编码单词符号 种别编码 单词符号 种别编码 单词符号 种别编码 单词符号 种别编码void 1 long 7 typedef 13 const 19char 2 signed 8 sizeof 14 volatile 20int 3 unsigned 9 auto 15 return 21float 4 struct 10 static 16 continue 22double 5 union 11 register 17 break 23short 6 enum 12 extern 18 goto 24if 25 ( 39 << 53 /= 67else 26 ) 40 >> 54 *= 68switch 27 . 41 > 55 %= 69case 28 -> 42 >= 56 += 70default 29 ~ 43 < 57 -= 71for 30 ++ 44 <= 58 &= 72do 31 -- 45 = = 59 ^= 73while 32 ! 46 != 60 |= 74scanf 33 & 47 ^ 61 , 75printf 34 * 48 | 62 # 76{ 35 / 49 && 63 ; 77} 36 % 50 || 64 : 78[ 37 + 51 ? 65 标识符 79] 38 - 52 = 66 数字 80下面为产生该C语言子集中所涉及的单词符号的文法的产生式。1)关键字文法:keyword→ void | char | int | float | double | short | long | signed | unsigned | struct | union | enum | typedef | sizeof | auto | static | register | extern | const | volatile | return | continue | break | goto | if | else | switch | case | default | for | do | while | scanf | printf2)标识符文法:letter→A | … | Z | a | …| zdigit→0 | 1 | … | 9id→letter rid |- ridrid→letter rid |- rid |digit rid | ε3)无符号整数文法:digit→0 | 1 | … | 9digits→digit rdigitrdigit→digit rdigit |ε4)算符和界符的文法:op→{ | } | [ | ] | ( | ) | . | -bigger | ~ | +plus | -minus | ! | & | * | / | % | + | - |bigger | > | >equal | < | plus→+minus→-less→
依据上述文法可得到如图3-2所示的状态转换图。其中,终态上的星号(*)表示此时还要把超前读出的字符退回,即搜索指针回调一个字符位置。在状态2时,所识别出的标识符应先与表的前34项逐一比较,若匹配,则该标识符是一个保留字,否则就是标识符。如果是标识符,则返回相应的种别编码和标识符本身。在状态4时,将识别的常数种别编码和常数值返回。在状态7、12、16、19、23时,为了识别相应的算符需进行超前搜索。
状态转换图非常易于实现,最简单的方法是为每个状态编写一段程序。对于不含回路的分支状态来说,可以用一个switch()语句或一组if-else语句实现;对于含回路的状态来说,可以让它对应一个while语句。图3-3给出了整个词法分析器的程序设计流程图。
#include#include #include #include #include #include using namespace std;struct symbol{ char * str; int coding;};char *keyword_list[34] = { "void", "char", "int", "float", "double", "short", "long", "signed", "unsigned", "struct", "union", "enum", "typedef", "sizeof", "auto", "static", "register", "extern", "const", "volatile", "return", "continue", "break", "goto", "if", "else", "switch", "case","default","for","do","while","scanf","printf"};char *operator_list[44] = { "{","}","[","]","(",")",".","->","~","++","--","!","&","*","/","%","+","-","<<",">>",">", ">=","<","<=","==","!=","^","|","&&","||","?","=","/=","*=","%=","+=","-=","&=","^=","|=",",","#",";",":"};char ch; //读入的字符char strToken[20] = "";/ /读入的字符串int eof_flag = 0;int num = 1;//编码的数字(为了递增)int row = 1;struct symbol keywords[34];struct symbol identifiers[44];FILE *fp = NULL;FILE *fw = NULL;ofstream out;//给单词符号设定种别编码void initialization() { //给关键字设定种别编码 for (int i = 0;i < 34;i++) { keywords[i].str = keyword_list[i]; keywords[i].coding = num; num++; } //给算符和界符设定种别编码 for (i = 0;i < 44;i++) { identifiers[i].str = operator_list[i]; identifiers[i].coding = num; num++; } //数字79,标识符80}//把下一个字符读入ch中void getNextChar(FILE *ffp){ if ((ch = getc(ffp)) == EOF) { eof_flag = 1; } if (ch == '\n') row++;}//检查ch的字符是否为空白符、回车或者制表符,若是,则反复调用getNextChar (),直至ch中读入一非上述符号void getbc(FILE * ffp){ while (ch == ' ' || ch == '\n' || ch == '\t') { getNextChar(ffp); }}//判断ch是否为字母bool isLetter(char ch){ return isalpha(ch);}//判断ch是否为数字bool isDigit(char ch){ return isdigit(ch);}//判断ch是否为下划线bool isUnderline(char ch){ if (ch == '_') return 1; else return 0;}//将输入的字符ch连接到strTokenvoid concat(){ char * tmp = &ch; strcat(strToken, tmp);}//把搜索指针回调一个字符位置void retract(FILE * ffp){ fseek(ffp, -1, SEEK_CUR); ch = ' ';}//对于strToken中的字符串判断它是否为保留字,若它是保留字则给出它的编码,否则返回0int reserve_string(char * str) { for (int i = 0;i < 34;i++) { if ((strcmp(str, keywords[i].str)) == 0) { return keywords[i].coding; } } return 0;}//返回strToken中所识别出的算符和界符编码int reserve_operator(char* ch){ for (int i = 0;i < 44;i++) { if ((strcmp(ch, identifiers[i].str)) == 0) { return identifiers[i].coding; } } return 0;}//出错处理void error(){ printf("\n ********Error*********************\n"); printf(" row %d Invaild symbol ! ! ! ", row); printf("\n ********Error*********************\n"); exit(0);}//词法分析void LexiscalAnalyzer(){ int num = 0, val = 0, code = 0; strcpy(strToken, ""); getNextChar(fp); getbc(fp); switch (ch) { case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': while (isLetter(ch) || isDigit(ch) || isUnderline(ch)) { concat(); getNextChar(fp); } retract(fp); code = reserve_string(strToken); if (code = = 0) { printf("(%d , %s)\n", 79, strToken); } else { printf("(%d , %s)\n", code, strToken); } break; case'0': case'1': case'2': case'3': case'4': case'5': case'6': case'7': case'8': case'9': while (isdigit(ch)) { concat(); getNextChar(fp); } retract(fp); printf("(%d , %s)\n",80, strToken); break; case '{': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case '}': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case '[': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case ']': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case '(': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case ')': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case '.': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case '-': concat(); getNextChar(fp); if (ch == '>') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else if (ch == '-') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '~': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case '+': concat(); getNextChar(fp); if (ch == '+') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '*': concat(); getNextChar(fp); if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '&': concat(); getNextChar(fp); if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else if (ch == '&') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '!': concat(); getNextChar(fp); if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '%': concat(); getNextChar(fp); if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '<': concat(); getNextChar(fp); if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else if (ch == '<') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '>': concat(); getNextChar(fp); if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else if (ch == '>') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '=': concat(); getNextChar(fp); if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '^': concat(); getNextChar(fp); if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '|': concat(); getNextChar(fp); if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else if (ch == '|') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case '?': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case '/': concat(); getNextChar(fp); if (ch == '=') { concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } else if (ch == '/') //跳过注释 { getNextChar(fp); while (ch != '\n') { getNextChar(fp); } break; } else if (ch == '*')//跳过注释 { getNextChar(fp); while (ch != '*') { getNextChar(fp); } getNextChar(fp); if (ch == '/'); break; } else { retract(fp); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); } break; case ',': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case '#': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case ';': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; case ':': concat(); code = reserve_operator(strToken); printf("(%d , %s)\n", code, strToken); break; default: if (ch == EOF) { eof_flag = 1; break; } error(); }}//主函数int main(){ initialization(); char name[1024]; cout<<"从文件读入:"; cin>>name; fp=fopen(name,"r"); out.open("result.txt"); while(!feof(fp)) { if (eof_flag == 1) { exit(1); } LexiscalAnalyzer(); } fclose(fp); out.close(); return 0;}
转载地址:http://rmaex.baihongyu.com/