指令生成 (Codegen)

指令生成 (Codegen)

由於是簡化版的編譯器,我們不產生中間的表達式 (Intermediate representation, IR) 1,也不會有 Abstract syntax tree (AST) 2,而是直接利用上一篇的螢幕輸出改成對應的 Java bytecode 指令 3。有興趣的可以去參考完整列表 4 看有支援什麼神奇功能。會寫 Java 的也可以參考如何使用反組譯的方式 5 找出對應的 bytecode 來觀察行為。

Jasmin

Jasmin 6 為一 JVM 的組譯器,其會將可讀指令形式的 bytecode 轉換成 .class 的形式 (可執行的 bytecode)。

.j 檔格式

  • 開頭及結尾如下,只需要在一開始開檔時就寫入開頭,等到全部指令產生完畢,離開前再寫入結尾即可。
.source bytecode.j
.class public Main
.super java/lang/Object
.method public static main([Ljava/lang/String;)V
.limit stack 10
.limit locals 10
    
    ; generated instructions
	
    return
.end method

完整程式碼

仔細看的話就會發現跟 ep2 的程式碼幾乎差不多,只有加了寫檔把對應操作的指令寫入 bytecode.j。

下方程式碼的 symbol table 為求簡單所以使用一維陣列實作,僅適用於本範例,容易造成記憶體錯誤且無法套用至多層級的 scope,所以請勿學習。
  • mycompiler.y:
/*	Definition section */
%{
    //Extern variables that communicate with lex
    #include "common.h" 
    // #define YYDEBUG 1
    // int yydebug = 1;

    extern int yylineno;
    extern int yylex();
    extern FILE *yyin;

    void yyerror (char const *s)
    {
        printf("error:%d: %s\n", yylineno, s);
    }

    #define codegen(...) \
        do { \
            for (int i = 0; i < indent_cnt; i++) { \
                fprintf(fout, "\t"); \
            } \
            fprintf(fout, __VA_ARGS__); \
        } while (0)


    /* Symbol table function */
    static void create_symbol(/* ... */);
    static int insert_symbol(const char* id_name);
    static int lookup_symbol(const char* id_name);
    static void dump_symbol();
    const char* get_op_name(op_t op) {
        switch (op) {
            case OP_ADD:
                return "ADD";
            case OP_SUB:
                return "SUB";
            case OP_MUL:
                return "MUL";
            case OP_DIV:
                return "DIV";
            default:
                return "unknown";
        }
    }
    const char* get_op_instruction(op_t op) {
        switch (op) {
            case OP_ADD:
                return "iadd";
            case OP_SUB:
                return "isub";
            case OP_MUL:
                return "imul";
            case OP_DIV:
                return "idiv";
            default:
                return "unknown";
        }
    }

    /* Global variables */
    int example_symbol_cnt = 0;
    #define MAX_SYMBOL_NUM 10
    char *example_symbol[MAX_SYMBOL_NUM] = {};
    int indent_cnt = 0; // control the number of ident in bytecode
    FILE* fout = NULL;
%}

%error-verbose

/* Use variable or self-defined structure to represent
 * nonterminal and token type
 */
%union {
    int val;
    char *id_name;
    op_t op;
}

/* Token without return */
%token DECL PRINT NEWLINE

/* Token with return, which need to sepcify type */
%token <val> NUMLIT
%token <id_name> IDENT

/* Nonterminal with return, which need to sepcify type */
%type <op> AddOp MulOp

/* Yacc will start at this nonterminal */
%start Program

/* Grammar section */
%%

Program
    : StatementList
;

StatementList
    : Statement StatementList
    |
;

Statement
    : DeclStmt
    | PrintStmt
;

DeclStmt
    : DECL IDENT '=' Expression NEWLINE {
        int ref = insert_symbol($<id_name>2);
        printf("IDENT %s, ref: %d\n", $<id_name>2, ref);
        printf("STORE\n");
        free($<id_name>2);
        codegen("istore %d\n", ref);
    }
;

PrintStmt
    : PRINT Expression NEWLINE {
        printf("PRINT\n");
        codegen("getstatic java/lang/System/out Ljava/io/PrintStream;\n");
        codegen("swap\n");
        codegen("invokevirtual java/io/PrintStream/print(I)V\n");
    }
;

Expression
    : AddExpr
;

AddExpr
    : AddExpr AddOp MulExpr {
        printf("%s\n", get_op_name($<op>2));
        codegen("%s\n", get_op_instruction($<op>2));
    }
    | MulExpr
;

AddOp
    : '+'  {
        $<op>$ = OP_ADD;
    }
    | '-' {
        $<op>$ = OP_SUB;
    }
;

MulExpr
    : MulExpr MulOp Operand {
        printf("%s\n", get_op_name($<op>2));
        codegen("%s\n", get_op_instruction($<op>2));
    }
    | Operand
;

MulOp
    : '*' {
        $<op>$ = OP_MUL;
    }
    | '/' {
        $<op>$ = OP_DIV;
    }
;

Operand
    : NUMLIT {
        printf("NUMLIT %d\n", $<val>1);
        codegen("ldc %d\n", $<val>1);
    }
    | IDENT {
        int ref = lookup_symbol($<id_name>1);
        printf("IDENT %s, ref: %d\n", $<id_name>1, ref);
        printf("LOAD\n");
        free($<id_name>1);
        codegen("iload %d\n", ref);
    }
    | '(' Expression ')'
;


%%

/* C code section */
int main(int argc, char *argv[])
{
    if (argc == 2) {
        yyin = fopen(argv[1], "r");
    } else {
        yyin = stdin;
    }

    /* Codegen output init */
    char *bytecode_filename = "bytecode.j";
    fout = fopen(bytecode_filename, "w");
    codegen(".source bytecode.j\n");
    codegen(".class public Main\n");
    codegen(".super java/lang/Object\n");
    codegen(".method public static main([Ljava/lang/String;)V\n");
    codegen(".limit stack 10\n");
    codegen(".limit locals 10\n");
    indent_cnt++;

    create_symbol();
    yyparse();
    dump_symbol();

    /* Codegen end */
    codegen("return\n");
    indent_cnt--;
    codegen(".end method\n");
    fclose(fout);
    fclose(yyin);
    printf("Total lines: %d\n", yylineno);
    return 0;
}

static void create_symbol()
{
    printf("> Create symbol table\n");
    // do nothing...
}

static int insert_symbol(const char* id_name)
{
    printf("> Insert {%s} into symbol table; assign it as ref {%d}\n", 
        id_name, example_symbol_cnt);
    example_symbol[example_symbol_cnt] = strdup(id_name);
    example_symbol_cnt++;
    return example_symbol_cnt - 1;
}

static int lookup_symbol(const char* id_name)
{
    printf("> Lookup in symbol table\n");
    for (int i = 0; i < example_symbol_cnt; i++) {
        if (strcmp(id_name, example_symbol[i]) == 0) {
            return i;
        }
    }
    printf("{%s} not found in symbol table\n", id_name);
    return -1;
}
static void dump_symbol()
{
    printf("> Dump symbol table\n");
    for (int i = 0; i < example_symbol_cnt; i++) {
        free(example_symbol[i]);
    }
}

測試範例

  • input/in01.lc:
decl x = 1 + 4
decl y = 2
decl num = x + y * (3 + 5)
print num
  • Result (其他檔案,如 Makefile 請參考 Source code):
$ make
$ ./mycompiler < input/in01.lc
$ java -jar jasmin.jar -g bytecode.j
Generated: Main.class
$ java Main
21
  • ⊛ Back to top
  • ⊛ Go to bottom