Lexer（Ver. Hand）

本文介绍如何手撕词法分析器，整体来说难度不大，还是文件读取麻烦。

lexer.h

#ifndef LEXER_H
#define LEXER_H#include <iostream>
#include <fstream>
#include <vector>
#include <string>using namespace std;enum token_type
{INT,VOID,CONST,IF,ELSE,WHILE,BREAK,CONTINUE,RETURN,ID,NUM,ASSIGN,OR,AND,CMP,LPARENT,RPARENT,LBRACKET,RBRACKET,LBRACE,RBRACE,SEMICN,COMMA,BINOPP,PLUSSUB,NOT
};vector<string> token_name =
{"INT\t\t","VOID\t","CONST\t","IF\t\t","ELSE\t","WHILE\t","BREAK\t","CONTINUE","RETURN\t","ID\t\t","NUM\t\t","ASSIGN\t","OR\t\t","AND\t\t","CMP\t\t","LPARENT\t","RPARENT\t","LBRACKET","RBRACKET","LBRACE\t","RBRACE\t","SEMICN\t","COMMA\t","BINOPP\t","PLUSSUB\t","NOT\t\t"
};int lineno = 1;
vector<pair<token_type, string>> tokens;
string token_str;bool isHexDigit(char ch)    //返回是否是16进制的字符，即0~9,a~f,A~F
{if (ch >= '0' && ch <= '9') return true;if (ch >= 'a' && ch <= 'f') return true;if (ch >= 'A' && ch <= 'F') return true;return false;
}bool isIdentifierChar(char ch)    //返回是否是组成标识符的字符
{return isalpha(ch) || isdigit(ch) || ch == '_';
}// token identifiers
bool Number(ifstream &fin)
{char ch;//代表是否是非法十六进制、八进制、十进制数bool isInvalidHex = 0, isInvalidOct = 0, isInvalidDec = 0;//第一位的处理，根据0与非0分别考虑if (token_str == "0")            //OCT or HEX or ZERO{int decnum = 0;if (fin.peek() == 'x' || fin.peek() == 'X')    //HEX{fin.get(ch);token_str += ch;bool is0x = 1;     //判断是否是单个的0x，能进入下面的while就证明不是while (isHexDigit(fin.peek())){is0x = 0;fin.get(ch);token_str += ch;decnum <<= 4;if (isdigit(ch)){decnum += ch - '0';}else if (ch >= 'a' && ch <= 'f'){decnum += ch - 'a' + 10;}else{decnum += ch - 'A' + 10;}}if (isIdentifierChar(fin.peek()) || is0x)//混入了不是16进制的其它字母{isInvalidHex = 1;}else{token_str = to_string(decnum);}}else if (fin.peek() >= '1' && fin.peek() <= '7') //OCT{while (fin.peek() >= '0' && fin.peek() <= '7'){fin.get(ch);token_str += ch;decnum <<= 3;decnum += ch - '0';}if (isIdentifierChar(fin.peek()))         //混入了8以上的数字或字母{isInvalidOct = 1;}else{token_str = to_string(decnum);}}else if (isIdentifierChar(fin.peek()))        //invalid oct{isInvalidOct = 1;}else                                        //ZERO{token_str = "0";}}else                            //DEC{while (isdigit(fin.peek())){fin.get(ch);token_str += ch;}if (isIdentifierChar(fin.peek()))    //混入了字母{isInvalidDec = 1;}}if (isInvalidHex || isInvalidOct || isInvalidDec) //是非法数字，就把剩下的字母数字读完{while (isIdentifierChar(fin.peek())){fin.get(ch);token_str += ch;}if (isInvalidHex) printf("\033[1;31mInvalid Hex number at Line %d: %s\033[0m\n", lineno, token_str.c_str());if (isInvalidOct) printf("\033[1;31mInvalid Oct number at Line %d: %s\033[0m\n", lineno, token_str.c_str());if (isInvalidDec) printf("\033[1;31mInvalid Dec number at Line %d: %s\033[0m\n", lineno, token_str.c_str());return false;}else                                               //是合法数字，正常返回{tokens.emplace_back(token_type::NUM, token_str);return true;}
}bool Identifier(ifstream &fin)
{char ch;string st;st = token_str;while (ch = fin.peek(), (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_' || ch >= '0' && ch <= '9'){fin.get(ch);st += ch;}if (st == "int"){tokens.emplace_back(INT, st);}else if (st == "void"){tokens.emplace_back(VOID, st);}else if (st == "const"){tokens.emplace_back(CONST, st);}else if (st == "if"){tokens.emplace_back(IF, st);}else if (st == "else"){tokens.emplace_back(ELSE, st);}else if (st == "while"){tokens.emplace_back(WHILE, st);}else if (st == "break"){tokens.emplace_back(BREAK, st);}else if (st == "continue"){tokens.emplace_back(CONTINUE, st);}else if (st == "return"){tokens.emplace_back(RETURN, st);}else{tokens.emplace_back(ID, st);}return 1;
}bool Annotation(ifstream &fin)
{char ch;if (token_str == "/"){if (fin.peek() == '/'){while (fin.peek() != EOF){fin.get(ch);if (ch == '\n'){++lineno;break;}}if (ch == '\n' || fin.peek() == EOF) return true;else return false;}else if (fin.peek() == '*'){int lineno2 = lineno;while (fin.peek() != EOF){fin.get(ch);if (ch == '\n') ++lineno;if (ch == '*' && fin.peek() == '/') break;}if (ch == '*' && fin.peek() == '/'){fin.get(ch);return true;}else{printf("\033[1;31mUnterminated %s comment at line %d.\033[0m\n", "/*", lineno2);return false;}}else return false;}else return false;
}bool And(ifstream &fin)
{tokens.emplace_back(AND, "&&");if (fin.peek() == '&')return true;else{printf("\033[1;31mInvalid token \"&\" at Line %d: Do you mean \"&&\" ?\033[0m\n", lineno);return false;}
}bool Or(ifstream &fin)
{tokens.emplace_back(OR, "||");if (fin.peek() == '|')return true;else{printf("\033[1;31mInvalid token \"|\" at Line %d: Do you mean \"||\" ?\033[0m\n", lineno);return false;}
}#endif //LEXER_H

main.cpp

p#include "lexer.h"int main()
{// read fileifstream fin;fin.open("../sample.c");if (!fin){cout << "File not found" << endl;return 0;}char ch;while (fin.get(ch)){token_str = move(string(1, ch));switch (ch){case '\n':++lineno;continue;case ' ':case '\t':continue;case '{':tokens.emplace_back(token_type::LBRACE, "{");continue;case '}':tokens.emplace_back(token_type::RBRACE, "}");continue;case '(':tokens.emplace_back(token_type::LPARENT, "(");continue;case ')':tokens.emplace_back(token_type::RPARENT, ")");continue;case '[':tokens.emplace_back(token_type::LBRACKET, "[");continue;case ']':tokens.emplace_back(token_type::RBRACKET, "]");continue;case ';':tokens.emplace_back(token_type::SEMICN, ";");continue;case ',':tokens.emplace_back(token_type::COMMA, ",");continue;case '+':case '-':tokens.emplace_back(token_type::PLUSSUB, string(1, ch));continue;case '*':case '%':tokens.emplace_back(token_type::BINOPP, string(1, ch));continue;case '/':if (fin.peek() == '*' || fin.peek() == '/')Annotation(fin);elsetokens.emplace_back(token_type::BINOPP, string(1, ch));continue;case '!':if (fin.peek() == '='){fin.get(ch);tokens.emplace_back(token_type::CMP, "!=");}elsetokens.emplace_back(token_type::NOT, "!");continue;case '&':And(fin);continue;case '|':Or(fin);continue;case '=':if (fin.peek() == '='){fin.get(ch);tokens.emplace_back(token_type::CMP, "==");}elsetokens.emplace_back(token_type::ASSIGN, "=");continue;case '>':if (fin.peek() == '='){fin.get(ch);tokens.emplace_back(token_type::CMP, ">=");}elsetokens.emplace_back(token_type::CMP, ">");continue;case '<':if (fin.peek() == '='){fin.get(ch);tokens.emplace_back(token_type::CMP, "<=");}elsetokens.emplace_back(token_type::CMP, "<");continue;default:if (isdigit(ch))Number(fin);else if (isalpha(ch) || ch == '_')Identifier(fin);else if (!Annotation(fin))printf("\033[1;31mUnexpected character \"%c\" at Line %d.\033[0m\n", ch, lineno);}}for (auto &t: tokens)cout << token_name[t.first] << "\t\t" << t.second << endl;
}

Tips

fin是一个文件输入流，我们从中读取需要识别的程序代码
顶层逻辑是读取当前文件指针指向的字符ch，根据ch用switch分配到各个token识别子函数，用子函数读取一个完整的token(一两个字符的token就不设置函数了)
单个和双个字符的token识别已经在顶层中完成，即’\n’、‘\t’、空格、所有括号、逗号分号、所有运算符和比较符，可以参照这部分代码
token的数据结构是 vector<pair<token_type,string>>·，其中token_type是一个枚举类型，表示该token的类型；string则是存储该token的内容，一般为其字面值
用tokens.emplace_back(TYPE,Str)添加token，参见switch中的语句
每个token识别函数都会返回一个bool·值来表明该类型的token是否识别成功，如果识别成功则返回true，并需要将相应的token pair添加到tokens；若识别失败则返回false，并给出相应的错误提示.
错误提示规范：

printf(“\033[1;31m【Error Type】 at Line %d：【info】\033[0m\n”,lineno);

【Error Type】是错误类型；【info】是附加信息，可以省略
注意如果识别失败，将有两种策略
1. 将文件读指针退回去：比如说在识别关键字token的函数Keyword(fin)中发现这其实是一个变量名，那么我们需要将文件指针退回到调用函数之前
2. 将错就错：对于一些很好理解的错误，我们可以直接帮他进行修复，比如说读入一个’&’结果发现下一个字符不是’&’，很明显是用户少打了，那么我们依然可以采取tokens.emplace_back(AND, "&&");，并给出错误提示“Do you mean “&&” ?”(And和Or函数采用的就是这个策略)
token识别函数可以参照 bool And(ifstream &fin)和bool Or(ifstream &fin)
需要在本地运行时，只需在main.cpp同目录下创建sample.c即可
每个token识别函数的形参都是文件输入流变量fin的引用，主要使用fin的以下三个函数

函数	作用
fin.get(ch)	获取文件的下一个字符给ch，并将文件读指针往下移动一位
ch=fin.peek()	获取文件的下一个字符给ch，但不移动文件读指针(peek v. 窥视；窥见)
fin.seekg(-k, ios::cur)	将文件读指针回退k位，一般用于读入一连串字符后发现识别失败，那么需要将文件指针退回去

几个全局变量及其含义

全局变量	含义
token_name	通过token_name[token_type] 可以获取token_type的字面字符串(因为枚举类型的本质是整数)
lineno	当前文件读指针的行数，报错时需要显示
tokens	存取所有token pair的vector
token_str	存取当前token的字符串值，一方面根据该变量的长度可以确定回退位数，另一方面作为token的内容 ⚠️读入的第一个字符会存在这个变量中

实现的token识别函数

函数名	识别对象	备注
Number	八进制、十进制、十六进制	token.emplace_back(token_type::NUM,转成十进制数值后的字符串) 需要具备识别非法数值的能力
Identifier	变量名	token.emplace_back(token_type::ID,变量名) ⚠️重复的变量名不添加
Keyword	关键字	token.emplace_back(token_type::关键字相应的TYPE,关键字字符串)
Annotation	注释	无需生成token，两类注释： 1.//···\n 2./···/

Sample.c

int main()
{int abc = 0123;const 123abc;if(xxx != yyy){_do_something(); //wow}else if(xxx / yyy == zzz){_hello_world_123() //haha}else{HAhaHa0000__ = 3;}void hello0 =0world;while(1){i = i-1;if(0+1==2)    break;else        continue;}/*qwertyuiop;HAHAHAHAH;aaaaaaaaaaaaaaaaaa;return 0;*/intvoidconstbreakwhilecontinueifelse;return 0xff; /**/
}
/** sdadsdda/*/