本文介绍如何手撕词法分析器,整体来说难度不大,还是文件读取麻烦。
lexer.h
#ifndef LEXER_H
#define LEXER_H#include <iostream>
#include <fstream>
#include <vector>
#include <string>using namespace std;enum token_type
{INT,VOID,CONST,IF,ELSE,WHILE,BREAK,CONTINUE,RETURN,ID,NUM,ASSIGN,OR,AND,CMP,LPARENT,RPARENT,LBRACKET,RBRACKET,LBRACE,RBRACE,SEMICN,COMMA,BINOPP,PLUSSUB,NOT
};vector<string> token_name =
{"INT\t\t","VOID\t","CONST\t","IF\t\t","ELSE\t","WHILE\t","BREAK\t","CONTINUE","RETURN\t","ID\t\t","NUM\t\t","ASSIGN\t","OR\t\t","AND\t\t","CMP\t\t","LPARENT\t","RPARENT\t","LBRACKET","RBRACKET","LBRACE\t","RBRACE\t","SEMICN\t","COMMA\t","BINOPP\t","PLUSSUB\t","NOT\t\t"
};int lineno = 1;
vector<pair<token_type, string>> tokens;
string token_str;bool isHexDigit(char ch) //返回是否是16进制的字符,即0~9,a~f,A~F
{if (ch >= '0' && ch <= '9') return true;if (ch >= 'a' && ch <= 'f') return true;if (ch >= 'A' && ch <= 'F') return true;return false;
}bool isIdentifierChar(char ch) //返回是否是组成标识符的字符
{return isalpha(ch) || isdigit(ch) || ch == '_';
}// token identifiers
bool Number(ifstream &fin)
{char ch;//代表是否是非法十六进制、八进制、十进制数bool isInvalidHex = 0, isInvalidOct = 0, isInvalidDec = 0;//第一位的处理,根据0与非0分别考虑if (token_str == "0") //OCT or HEX or ZERO{int decnum = 0;if (fin.peek() == 'x' || fin.peek() == 'X') //HEX{fin.get(ch);token_str += ch;bool is0x = 1; //判断是否是单个的0x,能进入下面的while就证明不是while (isHexDigit(fin.peek())){is0x = 0;fin.get(ch);token_str += ch;decnum <<= 4;if (isdigit(ch)){decnum += ch - '0';}else if (ch >= 'a' && ch <= 'f'){decnum += ch - 'a' + 10;}else{decnum += ch - 'A' + 10;}}if (isIdentifierChar(fin.peek()) || is0x)//混入了不是16进制的其它字母{isInvalidHex = 1;}else{token_str = to_string(decnum);}}else if (fin.peek() >= '1' && fin.peek() <= '7') //OCT{while (fin.peek() >= '0' && fin.peek() <= '7'){fin.get(ch);token_str += ch;decnum <<= 3;decnum += ch - '0';}if (isIdentifierChar(fin.peek())) //混入了8以上的数字或字母{isInvalidOct = 1;}else{token_str = to_string(decnum);}}else if (isIdentifierChar(fin.peek())) //invalid oct{isInvalidOct = 1;}else //ZERO{token_str = "0";}}else //DEC{while (isdigit(fin.peek())){fin.get(ch);token_str += ch;}if (isIdentifierChar(fin.peek())) //混入了字母{isInvalidDec = 1;}}if (isInvalidHex || isInvalidOct || isInvalidDec) //是非法数字,就把剩下的字母数字读完{while (isIdentifierChar(fin.peek())){fin.get(ch);token_str += ch;}if (isInvalidHex) printf("\033[1;31mInvalid Hex number at Line %d: %s\033[0m\n", lineno, token_str.c_str());if (isInvalidOct) printf("\033[1;31mInvalid Oct number at Line %d: %s\033[0m\n", lineno, token_str.c_str());if (isInvalidDec) printf("\033[1;31mInvalid Dec number at Line %d: %s\033[0m\n", lineno, token_str.c_str());return false;}else //是合法数字,正常返回{tokens.emplace_back(token_type::NUM, token_str);return true;}
}bool Identifier(ifstream &fin)
{char ch;string st;st = token_str;while (ch = fin.peek(), (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_' || ch >= '0' && ch <= '9'){fin.get(ch);st += ch;}if (st == "int"){tokens.emplace_back(INT, st);}else if (st == "void"){tokens.emplace_back(VOID, st);}else if (st == "const"){tokens.emplace_back(CONST, st);}else if (st == "if"){tokens.emplace_back(IF, st);}else if (st == "else"){tokens.emplace_back(ELSE, st);}else if (st == "while"){tokens.emplace_back(WHILE, st);}else if (st == "break"){tokens.emplace_back(BREAK, st);}else if (st == "continue"){tokens.emplace_back(CONTINUE, st);}else if (st == "return"){tokens.emplace_back(RETURN, st);}else{tokens.emplace_back(ID, st);}return 1;
}bool Annotation(ifstream &fin)
{char ch;if (token_str == "/"){if (fin.peek() == '/'){while (fin.peek() != EOF){fin.get(ch);if (ch == '\n'){++lineno;break;}}if (ch == '\n' || fin.peek() == EOF) return true;else return false;}else if (fin.peek() == '*'){int lineno2 = lineno;while (fin.peek() != EOF){fin.get(ch);if (ch == '\n') ++lineno;if (ch == '*' && fin.peek() == '/') break;}if (ch == '*' && fin.peek() == '/'){fin.get(ch);return true;}else{printf("\033[1;31mUnterminated %s comment at line %d.\033[0m\n", "/*", lineno2);return false;}}else return false;}else return false;
}bool And(ifstream &fin)
{tokens.emplace_back(AND, "&&");if (fin.peek() == '&')return true;else{printf("\033[1;31mInvalid token \"&\" at Line %d: Do you mean \"&&\" ?\033[0m\n", lineno);return false;}
}bool Or(ifstream &fin)
{tokens.emplace_back(OR, "||");if (fin.peek() == '|')return true;else{printf("\033[1;31mInvalid token \"|\" at Line %d: Do you mean \"||\" ?\033[0m\n", lineno);return false;}
}#endif //LEXER_H
main.cpp
p#include "lexer.h"int main()
{// read fileifstream fin;fin.open("../sample.c");if (!fin){cout << "File not found" << endl;return 0;}char ch;while (fin.get(ch)){token_str = move(string(1, ch));switch (ch){case '\n':++lineno;continue;case ' ':case '\t':continue;case '{':tokens.emplace_back(token_type::LBRACE, "{");continue;case '}':tokens.emplace_back(token_type::RBRACE, "}");continue;case '(':tokens.emplace_back(token_type::LPARENT, "(");continue;case ')':tokens.emplace_back(token_type::RPARENT, ")");continue;case '[':tokens.emplace_back(token_type::LBRACKET, "[");continue;case ']':tokens.emplace_back(token_type::RBRACKET, "]");continue;case ';':tokens.emplace_back(token_type::SEMICN, ";");continue;case ',':tokens.emplace_back(token_type::COMMA, ",");continue;case '+':case '-':tokens.emplace_back(token_type::PLUSSUB, string(1, ch));continue;case '*':case '%':tokens.emplace_back(token_type::BINOPP, string(1, ch));continue;case '/':if (fin.peek() == '*' || fin.peek() == '/')Annotation(fin);elsetokens.emplace_back(token_type::BINOPP, string(1, ch));continue;case '!':if (fin.peek() == '='){fin.get(ch);tokens.emplace_back(token_type::CMP, "!=");}elsetokens.emplace_back(token_type::NOT, "!");continue;case '&':And(fin);continue;case '|':Or(fin);continue;case '=':if (fin.peek() == '='){fin.get(ch);tokens.emplace_back(token_type::CMP, "==");}elsetokens.emplace_back(token_type::ASSIGN, "=");continue;case '>':if (fin.peek() == '='){fin.get(ch);tokens.emplace_back(token_type::CMP, ">=");}elsetokens.emplace_back(token_type::CMP, ">");continue;case '<':if (fin.peek() == '='){fin.get(ch);tokens.emplace_back(token_type::CMP, "<=");}elsetokens.emplace_back(token_type::CMP, "<");continue;default:if (isdigit(ch))Number(fin);else if (isalpha(ch) || ch == '_')Identifier(fin);else if (!Annotation(fin))printf("\033[1;31mUnexpected character \"%c\" at Line %d.\033[0m\n", ch, lineno);}}for (auto &t: tokens)cout << token_name[t.first] << "\t\t" << t.second << endl;
}
Tips
fin
是一个文件输入流,我们从中读取需要识别的程序代码顶层逻辑是读取当前文件指针指向的字符
ch
,根据ch
用switch
分配到各个token识别子函数,用子函数读取一个完整的token(一两个字符的token就不设置函数了)单个和双个字符的token识别已经在顶层中完成,即’\n’、‘\t’、空格、所有括号、逗号分号、所有运算符和比较符,可以参照这部分代码
token的数据结构是
vector<pair<token_type,string>>
·,其中token_type是一个枚举类型,表示该token的类型;string则是存储该token的内容,一般为其字面值用
tokens.emplace_back(TYPE,Str)
添加token,参见switch
中的语句每个token识别函数都会返回一个
bool
·值来表明该类型的token是否识别成功,如果识别成功则返回true
,并需要将相应的token pair添加到tokens
;若识别失败则返回false
,并给出相应的错误提示.错误提示规范:
printf(“\033[1;31m【Error Type】 at Line %d:【info】\033[0m\n”,lineno);
【Error Type】是错误类型;【info】是附加信息,可以省略
注意如果识别失败,将有两种策略
- 将文件读指针退回去:比如说在识别关键字token的函数
Keyword(fin)
中发现这其实是一个变量名,那么我们需要将文件指针退回到调用函数之前 - 将错就错:对于一些很好理解的错误,我们可以直接帮他进行修复,比如说读入一个’&’结果发现下一个字符不是’&’,很明显是用户少打了,那么我们依然可以采取
tokens.emplace_back(AND, "&&");
,并给出错误提示“Do you mean “&&” ?”(And
和Or
函数采用的就是这个策略)
- 将文件读指针退回去:比如说在识别关键字token的函数
token识别函数可以参照
bool And(ifstream &fin)
和bool Or(ifstream &fin)
需要在本地运行时,只需在main.cpp同目录下创建sample.c即可
每个token识别函数的形参都是文件输入流变量
fin
的引用,主要使用fin
的以下三个函数
函数 | 作用 |
---|---|
fin.get(ch) | 获取文件的下一个字符给ch,并将文件读指针往下移动一位 |
ch=fin.peek() | 获取文件的下一个字符给ch,但不移动文件读指针(peek v. 窥视;窥见) |
fin.seekg(-k, ios::cur) | 将文件读指针回退k位,一般用于读入一连串字符后发现识别失败,那么需要将文件指针退回去 |
- 几个全局变量及其含义
全局变量 | 含义 |
---|---|
token_name | 通过token_name[token_type] 可以获取token_type的字面字符串(因为枚举类型的本质是整数) |
lineno | 当前文件读指针的行数,报错时需要显示 |
tokens | 存取所有token pair的vector |
token_str | 存取当前token的字符串值,一方面根据该变量的长度可以确定回退位数,另一方面作为token的内容 ⚠️读入的第一个字符会存在这个变量中 |
- 实现的token识别函数
函数名 | 识别对象 | 备注 |
---|---|---|
Number | 八进制、十进制、十六进制 | token.emplace_back(token_type::NUM,转成十进制数值后的字符串) 需要具备识别非法数值的能力 |
Identifier | 变量名 | token.emplace_back(token_type::ID,变量名) ⚠️重复的变量名不添加 |
Keyword | 关键字 | token.emplace_back(token_type::关键字相应的TYPE,关键字字符串) |
Annotation | 注释 | 无需生成token,两类注释: 1.//···\n 2./*···*/ |
Sample.c
int main()
{int abc = 0123;const 123abc;if(xxx != yyy){_do_something(); //wow}else if(xxx / yyy == zzz){_hello_world_123() //haha}else{HAhaHa0000__ = 3;}void hello0 =0world;while(1){i = i-1;if(0+1==2) break;else continue;}/*qwertyuiop;HAHAHAHAH;aaaaaaaaaaaaaaaaaa;return 0;*/intvoidconstbreakwhilecontinueifelse;return 0xff; /**/
}
/** sdadsdda/*/