Home コンピュータシステムの理論と実装 (10)
Post
Cancel

コンピュータシステムの理論と実装 (10)

前章までで中間コードをアセンブリに変換する部分は完成したので、本章からは後半部分である Jack 言語から中間コードへのコンパイラを書いていく。

10 章 コンパイラ#1: 構文解析

tokenizer を書く

コードを token ごとに区切り、xml として出力するコードを書けばよい。こんなん半角スペースで分割してそれぞれの要素が何であるかを判定していけばいいだけやんとか思っていたが、よく考えたらこれは誤り ("Hello world" は半角スペースを含むが一つの token であるし、a[0] は半角スペースを含まないが複数の token を含んでいる)。なので、行ごとに一文字ずつ走査していき、

  • " が発見されたら終わりが見つかるまで idx を進め、見つかったら STRING_CONST として token に加える
  • symbol や半角文字が見つかったらある token の終わりとみなし、まだ出力されていない文字列を token として出力

という操作が必要になる。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import re
import pathlib
import sys
import tokenDic


class TokenObject:
    def __init__(self, token, token_type) -> None:
        self.token: str = token
        self.token_type: str = token_type
        self.before: 'list[str]' = []
        self.after: 'list[str]' = []

    def format(self) -> str:
        substitute_symbol_dic: str[str] = {'<': '&lt;', '>': '&gt;', '&': '&amp;'}
        temp_token: str = self.token
        if self.token in substitute_symbol_dic.keys():
            temp_token = substitute_symbol_dic[self.token]
        substitute_type_dic: str[str] = {'string_const': 'stringConstant', 'int_const': 'integerConstant'}
        temp_token_type = self.token_type.lower()
        if temp_token_type in substitute_type_dic.keys():
            temp_token_type = substitute_type_dic[temp_token_type]
        output = f'<{temp_token_type}> {temp_token} </{temp_token_type}>'
        if len(self.before) != 0:
            output = '\n'.join(self.before) + '\n' + output
        if len(self.after) != 0:
            output += '\n' + '\n'.join(self.after)
        return output


class JackTokenizer:
    def __init__(self, p_file: str) -> None:
        self.p_file = p_file
        with open(p_file) as f:
            s: str = f.read()
        s = re.sub(r'(//.*\n|/\*.(.|\s)*?\*/)', '\n', s)  # delete comments
        s = re.sub(r' *\n', '\n', s)  # delete spaces at the end of lines
        self.lines: list[str] = s.split('\n')
        self.lines = [line for line in self.lines if not re.match(r'^\s*$', line)]
        self.current_line_num: int = 0
        self.line_num: int = len(self.lines)
        self.output: str = '<tokens>\n'
        self.token_objects: list[TokenObject] = []

    def tokenizeLine(self, line: str) -> 'list[TokenObject]':
        token_objects: list[TokenObject] = []
        token: str = ''
        token_type: str = ''
        len_of_list: int = len(line)
        idx: int = 0
        beg_idx: int = 0
        while idx < len_of_list:
            if line[idx] == '"':
                beg_idx = idx + 1
                idx += 1
                while idx < len_of_list:
                    if line[idx] == '"':
                        token_objects.append(TokenObject(line[beg_idx:idx], 'STRING_CONST'))
                        beg_idx = idx + 1
                        break
                    idx += 1
            elif line[idx] in tokenDic.symbols:
                if beg_idx != idx:
                    token = line[beg_idx:idx]
                    token_type = self.tokenType(token)
                    token_objects.append(TokenObject(token, token_type))
                token_objects.append(TokenObject(line[idx], 'SYMBOL'))
                beg_idx = idx + 1
            elif line[idx] == ' ' or line[idx] == '\t':
                if beg_idx != idx:
                    token = line[beg_idx:idx]
                    token_type = self.tokenType(token)
                    token_objects.append(TokenObject(token, token_type))
                beg_idx = idx + 1
            idx += 1
        return token_objects

    def tokenType(self, token) -> str:
        """
        Determine the type of a given token
        """
        if token in tokenDic.keywords:
            return 'KEYWORD'
        elif token in tokenDic.symbols:
            return 'SYMBOL'
        elif token.isdecimal():
            if int(token) <= 32767:
                return 'INT_CONST'
            else:
                SyntaxError('Value more than 32767 is not accepted')
        else:
            if token[0].isdecimal():
                SyntaxError('Identifier starts with a number is not accepted')
            else:
                return 'IDENTIFIER'

    def hasMoreLine(self) -> bool:
        return True if self.current_line_num < self.line_num else False

    def advance(self) -> bool:
        print(self.current_line_num)
        line: str = self.lines[self.current_line_num]
        token_objects = self.tokenizeLine(line)
        self.output += '\n'.join([token_object.format() for token_object in token_objects]) + '\n'
        self.token_objects.extend(token_objects)
        self.current_line_num += 1

    def saveToFile(self) -> None:
        self.output += '</tokens>\n'
        self.path_w = f'{self.p_file.parent}/{self.p_file.stem}T.xml'
        with open(self.path_w, mode='w') as f:
            f.write(self.output)


if __name__ == '__main__':
    path: str = sys.argv[1]
    p_path: pathlib.Path = pathlib.Path(path)
    p_file_list: 'list[pathlib.Path]' = []
    if p_path.is_dir():
        p_file_list = list(p_path.glob('**/*.jack'))
    else:
        p_file_list = [p_path]

    for p_file in p_file_list:
        tokenizer = JackTokenizer(p_file)
        while tokenizer.hasMoreLine():
            tokenizer.advance()
        tokenizer.saveToFile()

構文解析

構文木を構築するため、再帰下降構文解析を行う (この言葉かっこよくて好きだ)。最初のトークンだけからトークンの種類を決定することができる文法を LL(1) といい、Jack 言語は概ねこれに従っている。例えば他の言語だと返り値無しの関数実行は a.run() とかになるが、Jack では do a.run() などと明示することによって、先読みすることなくこれが SubroutineCall であることが分かるようになっている。

パーサを作るにあたっての留意点は括弧の処理である。例えば while 文 'while' '(' expression ')' '{' statements '}' という形で表されるが、expression や statements の中にも当然括弧が入りうるので、対応するものを正しく見つける必要がある。今回は、0 で初期化した変数を用意し、文字列を先頭から走査しつつ ( を見つけたら +1、) を見つけたら -1 していき、変数が 0 になったところを閉じ括弧とみなす方法を採用した。AtCoder の茶 diff くらいにありそうな問題だ。

1
2
3
4
5
6
7
8
9
10
11
12
def findClosingBracket(self, start: int, end: int, open_c: str, close_c: str) -> int:
    bracket_cnt: int = 0
    idx: int = start
    while idx <= end:
        if self.token_objects[idx].token == open_c:
            bracket_cnt += 1
        elif self.token_objects[idx].token == close_c:
            bracket_cnt -= 1
            if bracket_cnt == 0:
                return idx
        idx += 1
    return -1

基本的に p233-234 の仕様通りに書いていけばよいが、expression (式) の扱いでやや混乱した。expression は term (op term)* という形で表され、要は term が op を介して組み合わさっているという形 (具体的には、1 + 1 とか sum + a[0] + f(3) とか) なので、オペレータを見つけ次第 term として分割し、そのあとの文字列を再帰で処理することになる。

全体的にゴリゴリの再帰を書くことになり結構疲れた。なんやかんやできたので達成感はある。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
import pathlib
import sys
from tokenizer import TokenObject, JackTokenizer


class CompilationEngine:
    def __init__(self, token_objects: 'list[TokenObject]', p_file: str) -> None:
        self.token_objects = token_objects
        self.p_file = p_file
        self.statement_keyword_dic = {
            'let': self.compileLet,
            'if': self.compileIf,
            'while': self.compileWhile,
            'do': self.compileDo,
            'return': self.compileReturn}
        self.output = ''
        pass

    def compile(self):
        print(len(self.token_objects) - 1)
        self.compileClasses(0, len(self.token_objects) - 1)

    def compileClasses(self, start: int, end_script: int):
        """
        Compile classes (whole script) recursively
        """
        if start >= end_script:
            return
        closing_class_idx = self.compileClass(start, end_script)
        print(self.token_objects[closing_class_idx].token)
        return self.compileClasses(closing_class_idx + 1, end_script)

    def compileClass(self, start: int, end_script: int):
        """
        'class' className '{' classVarDec* subroutineDec* '}'
        ex) class Bar {...}
        """
        self.token_objects[start].before.append('<class>')
        print(self.token_objects[start + 1].token)
        closing_class_body_idx: int = self.findClosingBracket(start + 2, end_script, '{', '}')
        self.compileClassBody(start + 3, closing_class_body_idx - 1)
        self.token_objects[closing_class_body_idx].after.append('</class>')
        return closing_class_body_idx

    def compileClassBody(self, start: int, end_class_body: int):
        """
        classVarDec* subroutineDec*
        'static int a; method int hoge (...) {...}'
        """
        if start >= end_class_body:
            return
        first_token: str = self.token_objects[start].token
        if first_token in ['static', 'field']:
            end_idx: int = self.compileClassVarDec(start, end_class_body)
        else:
            end_idx: int = self.compileSubroutineDec(start, end_class_body)
        return self.compileClassBody(end_idx + 1, end_class_body)

    def compileClassVarDec(self, start: int, end_class_body: int) -> int:
        """
        ('static' | 'field') type varName (',' varName) * ';'
        ex) 'static int a;'
        """
        self.token_objects[start].before.append('<classVarDec>')
        print(self.token_objects[start + 1].token)
        idx: int = start
        while idx <= end_class_body:
            if self.token_objects[idx].token == ';':
                break
            idx += 1
        self.token_objects[idx].after.append('</classVarDec>')
        return idx

    def compileSubroutineDec(self, start: int, end_class_body: int) -> int:
        """
        ('constructor' | 'function' | 'method') ('void' | type) subroutineName '(' parameterList ')' '{' subroutineBody '}'
        ex) 'method int hoge (...) {...}'
        """
        self.token_objects[start].before.append('<subroutineDec>')
        print(self.token_objects[start + 2].token)
        # compile '(' parameterList ')'
        closing_param_list_idx: int = self.findClosingBracket(start + 3, end_class_body, '(', ')')
        self.compileParameterList(start + 4, closing_param_list_idx - 1)
        # compile '{' subroutineBody '}'
        self.token_objects[closing_param_list_idx + 1].before.append('<subroutineBody>')
        closing_subr_body_idx: int = self.findClosingBracket(closing_param_list_idx + 1, end_class_body, '{', '}')
        self.token_objects[closing_subr_body_idx].after.append('</subroutineBody>')
        self.compileSubroutineBody(closing_param_list_idx + 2, closing_subr_body_idx - 1)
        self.token_objects[closing_subr_body_idx].after.append('</subroutineDec>')
        return closing_subr_body_idx

    def compileSubroutineBody(self, start: int, end_body: int) -> None:
        """
        varDec* statements
        ex) 'var int a; let a = 1; return a;'
        """
        if start >= end_body:
            return
        first_token: str = self.token_objects[start].token
        if first_token == 'var':
            end_idx: int = self.compileVarDec(start, end_body)
            return self.compileSubroutineBody(end_idx + 1, end_body)
        else:
            self.token_objects[start].before.append('<statements>')
            end_idx: int = self.compileStatements(start, end_body)
            self.token_objects[end_body].after.append('</statements>')
            return

    def compileSubroutineCall(self, start: int, end: int) -> None:
        """
        subroutineName '(' expressionList ')' |
        (className | varName) '.' subroutineName '(' expressionList ')'
        ex) 'run()', 'a.run()'
        """
        # self.token_objects[start].before.append('<term>')
        idx: int = start
        while self.token_objects[idx].token != '(':
            idx += 1
        self.token_objects[idx].after.append('<expressionList>')
        closing_expression_list = self.findClosingBracket(idx, end, '(', ')')
        self.compileExpressionList(idx + 1, closing_expression_list - 1)
        self.token_objects[closing_expression_list].before.append('</expressionList>')
        # self.token_objects[end].after.append('</term>')

    def compileExpressionList(self, start: int, end: int) -> None:
        """
        (expression (',' expression)*)?
        ex) '(a + 1, b)'
        """
        if start > end:
            return
        idx: int = start
        exp_start_idx: int = start
        while idx <= end:
            if self.token_objects[idx].token == ',':
                self.compileExpression(exp_start_idx, idx - 1)
                exp_start_idx = idx + 1
            idx += 1
        self.compileExpression(exp_start_idx, end)

    def compileParameterList(self, start: int, end: int):
        """
        ((type varName) (',' type varName)*)?
        ex) 'int a, char c', ''
        """
        if start > end:
            self.token_objects[end].after.append('<parameterList>')
            self.token_objects[start].before.append('</parameterList>')
        else:
            self.token_objects[start].before.append('<parameterList>')
            self.token_objects[end].after.append('</parameterList>')

    def compileVarDec(self, start: int, end_body: int) -> int:
        """
        'var' type varName (',' varname)* ';'
        ex) 'int a, b;';
        """
        self.token_objects[start].before.append('<varDec>')
        idx: int = start
        while idx <= end_body:
            if self.token_objects[idx].token == ';':
                break
            idx += 1
        self.token_objects[idx].after.append('</varDec>')
        return idx

    def compileStatements(self, start: int, end_statements: int) -> None:
        if start >= end_statements:
            return
        first_token: str = self.token_objects[start].token
        end_statement: int = self.statement_keyword_dic[first_token](start, end_statements)
        self.compileStatements(end_statement + 1, end_statements)

    def compileDo(self, start: int, end_statements: int) -> int:
        """
        'do' subroutineCall ';'
        """
        self.token_objects[start].before.append('<doStatement>')
        idx: int = start
        while idx <= end_statements:
            if self.token_objects[idx].token == ';':
                break
            idx += 1
        self.token_objects[idx].after.append('</doStatement>')
        self.compileSubroutineCall(start + 1, idx - 1)
        return idx

    def compileLet(self, start: int, end_statements: int) -> int:
        """
        'let' varname ('[' expression ']')? '=' expression ';'
        ex) 'let a = 5;', 'let a[4] = b + 3;'
        """
        self.token_objects[start].before.append('<letStatement>')
        right_operand_start_idx: int = None
        if self.token_objects[start + 2].token == '[':  # if '[' expression ']'
            closing_expression_idx: int = self.findClosingBracket(start + 2, end_statements, '[', ']')
            self.compileExpression(start + 3, closing_expression_idx - 1)
            right_operand_start_idx = closing_expression_idx + 2
        else:
            right_operand_start_idx = start + 3
        idx: int = right_operand_start_idx
        while idx <= end_statements:
            if self.token_objects[idx].token == ';':
                break
            idx += 1
        self.token_objects[idx].after.append('</letStatement>')
        self.compileExpression(right_operand_start_idx, idx - 1)
        return idx

    def compileWhile(self, start: int, end_statements: int) -> int:
        """
        'while' '(' expression ')' '{' statements '}'
        """
        self.token_objects[start].before.append('<whileStatement>')
        # (expression)
        closing_expression_idx: int = self.findClosingBracket(start + 1, end_statements, '(', ')')
        self.compileExpression(start + 2, closing_expression_idx - 1)
        # {statements}
        closing_statements_idx: int = self.findClosingBracket(closing_expression_idx + 1, end_statements, '{', '}')
        self.token_objects[closing_expression_idx + 2].before.append('<statements>')
        self.compileStatements(closing_expression_idx + 2, closing_statements_idx - 1)
        self.token_objects[closing_statements_idx - 1].after.append('</statements>')
        self.token_objects[closing_statements_idx].after.append('</whileStatement>')
        return closing_statements_idx

    def compileReturn(self, start: int, end_statements: int) -> int:
        """
        'return' expression? ';'
        """
        self.token_objects[start].before.append('<returnStatement>')
        idx: int = start
        while idx <= end_statements:
            if self.token_objects[idx].token == ';':
                break
            idx += 1
        self.token_objects[idx].after.append('</returnStatement>')
        if idx - start > 1:
            self.compileExpression(start + 1, idx - 1)
        return idx

    def findClosingBracket(self, start: int, end: int, open_c: str, close_c: str) -> int:
        bracket_cnt: int = 0
        idx: int = start
        while idx <= end:
            if self.token_objects[idx].token == open_c:
                bracket_cnt += 1
            elif self.token_objects[idx].token == close_c:
                bracket_cnt -= 1
                if bracket_cnt == 0:
                    return idx
            idx += 1
        print('not found')

    def compileIf(self, start, end_statements) -> int:
        """
        'if' '(' expression ')' '{' statements '}' ('else' '{' statements '}')?
        """
        self.token_objects[start].before.append('<ifStatement>')
        # (expression)
        closing_expression_idx: int = self.findClosingBracket(start + 1, end_statements, '(', ')')
        self.compileExpression(start + 2, closing_expression_idx - 1)
        # {statements}
        closing_statements_idx: int = self.findClosingBracket(closing_expression_idx + 1, end_statements, '{', '}')
        self.token_objects[closing_expression_idx + 2].before.append('<statements>')
        self.compileStatements(closing_expression_idx + 2, closing_statements_idx - 1)
        self.token_objects[closing_statements_idx - 1].after.append('</statements>')
        # else
        if (closing_statements_idx == end_statements) or (
                self.token_objects[closing_statements_idx + 1].token != 'else'):
            self.token_objects[closing_statements_idx].after.append('</ifStatement>')
            return closing_statements_idx
        # {statements}
        closing_else_statements_idx: int = self.findClosingBracket(closing_statements_idx + 2, end_statements, '{', '}')
        self.token_objects[closing_statements_idx + 3].before.append('<statements>')
        self.compileStatements(closing_statements_idx + 3, closing_else_statements_idx - 1)
        self.token_objects[closing_else_statements_idx - 1].after.append('</statements>')
        self.token_objects[closing_else_statements_idx].after.append('</ifStatement>')
        return closing_else_statements_idx

    def compileExpression(self, start: int, end_exp: int) -> None:
        """
        term (op term)*
        """
        self.token_objects[start].before.append('<expression>')
        if start > end_exp:
            return
        idx: int = start
        while idx <= end_exp:
            # print(idx, end_exp)
            idx = self.compileTerm(idx, end_exp)
            idx += 1
            if self.token_objects[idx].token in ['+', '-', '*', '/', '&', '|', '<', '>', '=']:
                idx += 1
        self.token_objects[end_exp].after.append('</expression>')

    def compileTerm(self, start: int, end_exp: int) -> None:
        self.token_objects[start].before.append('<term>')
        first_token: str = self.token_objects[start].token
        print(first_token)
        end_idx: int = start
        if end_exp - start < 1:
            end_idx = end_exp
        # '(' expression ')'
        if first_token == '(':
            closing_term: int = self.findClosingBracket(start, end_exp, '(', ')')
            self.compileExpression(start + 1, closing_term - 1)
            end_idx = closing_term
        # unaryOp
        elif first_token in ['-', '~']:
            closing_right_operand: int = self.compileTerm(start + 1, end_exp)
            end_idx = closing_right_operand
        # varName '[' expression ']'
        elif self.token_objects[start + 1].token == '[':
            closing_term: int = self.findClosingBracket(start, end_exp, '[', ']')
            self.compileExpression(start + 2, closing_term - 1)
            end_idx = closing_term
        # subroutineName '(' expressionList ')'
        elif self.token_objects[start + 1].token == '(':
            print('subroutine')
            closing_term: int = self.findClosingBracket(start + 1, end_exp, '(', ')')
            self.compileSubroutineCall(start, closing_term)
            end_idx = closing_term
        # (className | varName) '.' subroutineName '(' expressionList ')'
        elif self.token_objects[start + 1].token == '.':
            print('subroutine')
            closing_term: int = self.findClosingBracket(start + 3, end_exp, '(', ')')
            self.compileSubroutineCall(start, closing_term)
            end_idx = closing_term
        self.token_objects[end_idx].after.append('</term>')
        return end_idx

    def saveToFile(self) -> None:
        self.output += '\n'.join([token_object.format() for token_object in self.token_objects]) + '\n'
        self.path_w = f'{self.p_file.parent}/{self.p_file.stem}.xml'
        with open(self.path_w, mode='w') as f:
            f.write(self.output)


if __name__ == '__main__':
    path: str = sys.argv[1]
    p_path: pathlib.Path = pathlib.Path(path)
    p_file_list: 'list[pathlib.Path]' = []
    if p_path.is_dir():
        p_file_list = list(p_path.glob('**/*.jack'))
    else:
        p_file_list = [p_path]

    for p_file in p_file_list:
        print(p_file)
        tokenizer = JackTokenizer(p_file)
        while tokenizer.hasMoreLine():
            tokenizer.advance()
        compiler = CompilationEngine(tokenizer.token_objects, p_file)
        compiler.compile()
        compiler.saveToFile()
This post is licensed under CC BY 4.0 by the author.

コンピュータシステムの理論と実装 (9)

コンピュータシステムの理論と実装 (11)