From 1159d779fcb68df643e95c7b24c63055a083188a Mon Sep 17 00:00:00 2001
From: Taehwan Kim
Date: Sun, 18 Aug 2024 15:05:56 +0900
Subject: [PATCH] add new crate for build script support

- fix calculator example to use buildscript
- add(core) doc comment, version 2.2.2
- fix README and doc comments
- rusty_lr version to 2.1.0
---
 Cargo.toml                                |   1 +
 README.md                                 | 262 +++---
 example/calculator/Cargo.toml             |   4 +
 example/calculator/build.rs               |  11 +
 example/calculator/src/main.rs            |   4 +-
 example/calculator/src/parser.rs          | 100 ++-
 example/calculator/src/parser_expanded.rs |   1 +
 rusty_lr/Cargo.toml                       |   7 +-
 rusty_lr/src/lib.rs                       | 169 ++--
 rusty_lr_buildscript/Cargo.toml           |  20 +
 rusty_lr_buildscript/src/lib.rs           | 980 ++++++++++++++++++++++
 rusty_lr_buildscript/src/output.rs        |  10 +
 rusty_lr_buildscript/src/split.rs         |  30 +
 rusty_lr_buildscript/src/utils.rs         |  23 +
 rusty_lr_core/Cargo.toml                  |   2 +-
 rusty_lr_core/src/lib.rs                  |   1 +
 16 files changed, 1312 insertions(+), 313 deletions(-)
 create mode 100644 example/calculator/build.rs
 create mode 100644 example/calculator/src/parser_expanded.rs
 create mode 100644 rusty_lr_buildscript/Cargo.toml
 create mode 100644 rusty_lr_buildscript/src/lib.rs
 create mode 100644 rusty_lr_buildscript/src/output.rs
 create mode 100644 rusty_lr_buildscript/src/split.rs
 create mode 100644 rusty_lr_buildscript/src/utils.rs

diff --git a/Cargo.toml b/Cargo.toml
index 8bdaecf..ee3f1f0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,7 @@ members = [
     "rusty_lr_core",
     "rusty_lr_derive",
     "rusty_lr_parser",
+    "rusty_lr_buildscript",
     "rusty_lr_executable",
     "example/calculator",
     "example/calculator_u8",
diff --git a/README.md b/README.md
index f7dd7d4..b1972b2 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,28 @@
 # RustyLR
 [![crates.io](https://img.shields.io/crates/v/rusty_lr.svg)](https://crates.io/crates/rusty_lr)
 [![docs.rs](https://docs.rs/rusty_lr/badge.svg)](https://docs.rs/rusty_lr)
-for [proc-macro](#proc-macro)
-
-[![crates.io](https://img.shields.io/crates/v/rustylr.svg)](https://crates.io/crates/rustylr)
-for [executable](#executable-rustylr)

 yacc-like LR(1) and LALR(1) Deterministic Finite Automata (DFA) generator from Context-Free Grammars (CFGs).

-RustyLR provides both [executable](#executable-rustylr) and [procedural macros](#proc-macro) to generate LR(1) and LALR(1) parser.
+RustyLR provides [procedural macros](#proc-macro) and [buildscript tools](#integrating-with-buildrs) to generate LR(1) and LALR(1) parsers.
 The generated parser will be pure Rust code, and the construction of the DFA is done at compile time.
 Reduce actions can be written in Rust code,
-and the error messages are [readable and detailed](#readable-error-messages-with-codespan) with [executable](#executable-rustylr).
-For huge and complex grammars, it is recommended to use the [executable](#executable-rustylr) version.
+and the error messages are [readable and detailed](#readable-error-messages-with-codespan).
+For huge and complex grammars, it is recommended to use the [buildscript](#integrating-with-buildrs).

-By default, RustyLR uses `std::collections::HashMap` for the parser tables.
-If you want to use `FxHashMap` from [`rustc-hash`](https://github.com/rust-lang/rustc-hash), add `features=["fxhash"]` to your `Cargo.toml`.
-```toml
-[dependencies]
-rusty_lr = { version = "...", features = ["fxhash"] }
-```
+#### `features` in `Cargo.toml`
+ - `build` : Enable buildscript tools.
+ - `fxhash` : In the parser table, replace `std::collections::HashMap` with `FxHashMap` from [`rustc-hash`](https://github.com/rust-lang/rustc-hash).

 ### Example
 ```rust
 // this defines the `EParser` struct
 // where `E` is the start symbol
 lr1! {
-    // userdata type
-    %userdata i32;
-    // token type
-    %tokentype char;
-    // start symbol
-    %start E;
-    // eof symbol
-    %eof '\0';
+    %userdata i32;   // userdata type
+    %tokentype char; // token type
+    %start E;        // start symbol
+    %eof '\0';       // eof token

     // token definition
     %token zero '0';
@@ -53,19 +42,19 @@ lr1! {
     %token space ' ';

     // conflict resolving
-    %left [plus star]; // reduce first for token 'plus', 'star'
+    %left [plus star];        // reduce first for token 'plus', 'star'

     // context-free grammars
-    Digit(char): [zero-nine]; // character set '0' to '9'
+    Digit(char): [zero-nine]; // character set '0' to '9'

-    Number(i32) // type assigned to production rule `Number`
-    : space* Digit+ space* // regex pattern
+    Number(i32)               // type assigned to production rule `Number`
+        : space* Digit+ space* // regex pattern
     { Digit.into_iter().collect::<String>().parse().unwrap() };
 //    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ this will be the value of `Number`

-    // reduce action written in Rust code
+    // reduce action written in Rust code
     A(f32): A plus a2=A {
-        *data += 1; // access userdata by `data`
+        *data += 1;           // access userdata by `data`
         println!( "{:?} {:?} {:?}", A, plus, a2 );
         A + a2
     }
@@ -84,12 +73,9 @@ lr1! {
 }
 ```
 ```rust
-// generate `EParser`
-let parser = EParser::new();
-// create context
-let mut context = parser.begin();
-// define userdata
-let mut userdata: i32 = 0;
+let parser = EParser::new();      // generate `EParser`
+let mut context = parser.begin(); // create context
+let mut userdata: i32 = 0;        // define userdata

 let input_sequence = "1 + 2 * ( 3 + 4 )";

@@ -97,7 +83,7 @@ let input_sequence = "1 + 2 * ( 3 + 4 )";
 for token in input_sequence.chars() {
     match parser.feed(&mut context, token, &mut userdata) {
         //                          ^^^^^  ^^^^^^^^^^^^^ userdata passed here as `&mut i32`
-        //                          |- feed token
         Ok(_) => {}
         Err(e) => {
             match e {
@@ -114,11 +100,9 @@ for token in input_sequence.chars() {
         }
     }
 }
-// feed `eof` token
-parser.feed(&mut context, '\0', &mut userdata).unwrap();
+parser.feed(&mut context, '\0', &mut userdata).unwrap(); // feed `eof` token

-// res = value of start symbol
-let res = context.accept();
+let res = context.accept(); // get the value of the start symbol

 println!("{}", res);
 println!("userdata: {}", userdata);
 ```

 ### Readable error messages (with [codespan](https://github.com/brendanzab/codespan))
 ![images/error1.png](images/error1.png)
 ![images/error2.png](images/error2.png)
-
-## Contents
- - [Proc-macro](#proc-macro)
- - [Start Parsing](#start-parsing)
- - [Error Handling](#error-handling)
- - [Syntax](#syntax)
- - [Executable `rustylr`](#executable-rustylr)
+ - This error message is generated by the buildscript tool, not the procedural macros.
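+
+To enable these features in your own project, the `Cargo.toml` entries might look like this (a minimal sketch; replace `"..."` with the current version — `fxhash` applies to the generated parser, `build` is only needed by the build script):
+
+```toml
+[dependencies]
+rusty_lr = { version = "...", features = ["fxhash"] }
+
+[build-dependencies]
+rusty_lr = { version = "...", features = ["build"] }
+```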
## Features
 - pure Rust implementation
@@ -141,12 +119,20 @@ println!("userdata: {}", userdata);
 - customizable reduce action
 - resolving conflicts of ambiguous grammar
 - regex patterns partially supported
- - executable for generating parser tables
+ - tools for integrating with `build.rs`
+
+## Contents
+ - [Proc-macro](#proc-macro)
+ - [Integrating with `build.rs`](#integrating-with-buildrs)
+ - [Start Parsing](#start-parsing)
+ - [Error Handling](#error-handling)
+ - [Syntax](#syntax)
+

 ## proc-macro
 The following procedural macros are provided:
- - `lr1!` : LR(1) parser
- - `lalr1!` : LALR(1) parser
+ - `lr1!` : generate LR(1) parser
+ - `lalr1!` : generate LALR(1) parser

 These macros will generate structs:
 - `Parser` : contains DFA tables and production rules
@@ -159,6 +145,68 @@ These macros will generate structs:
 All structs above are prefixed by `<StartSymbol>`.
 In most cases, what you want is the `Parser` and `ParseError` structs, and the others are used internally.

+## Integrating with `build.rs`
+This buildscript tool provides much more detailed, pretty-printed error messages than the procedural macros.
+If you are writing a huge, complex grammar, it is recommended to use the buildscript rather than the procedural macros.
+The generated code will contain the same structs and functions as the procedural macros. In your actual source code, you can `include!` the generated file.
+
+The tool searches for `%%` in the input file, rather than for the `lr1!` or `lalr1!` macros.
+The contents before `%%` will be copied into the output file as-is.
+The context-free grammar must follow the `%%`.
+
+```rust
+// parser.rs
+use some_crate::some_module::SomeStruct;
+
+enum SomeTypeDef {
+    A,
+    B,
+    C,
+}
+
+%% // <-- input file split here
+
+%tokentype u8;
+%start E;
+%eof b'\0';
+
+%token a b'a';
+%token lparen b'(';
+%token rparen b')';
+
+E: lparen E rparen
+ | P
+ ;
+
+P: a;
+```
+
+You must enable the `build` feature to use these tools in a build script.
+```toml
+[build-dependencies]
+rusty_lr = { version = "...", features = ["build"] }
+```
+
+```rust
+// build.rs
+use rusty_lr::build;
+
+fn main() {
+    println!("cargo::rerun-if-changed=src/parser.rs");
+
+    let output = format!("{}/parser.rs", std::env::var("OUT_DIR").unwrap());
+    build::Builder::new()
+        .file("src/parser.rs") // path to the input file
+        // .lalr()             // to generate LALR(1) parser
+        .build(&output);       // path to the output file
+}
+```
+
+In your source code, include the generated file.
+```rust
+include!(concat!(env!("OUT_DIR"), "/parser.rs"));
+```
+
 ## Start Parsing
 The `Parser` struct has the following functions:
 - `new()` : create new parser
@@ -285,7 +333,6 @@ Example


 ```rust
-lr1! {
 %tokentype u8;

 %token zero b'0';
@@ -295,7 +342,6 @@

 // 'zero' and 'one' will be replaced by b'0' and b'1' respectively
 E: zero one;
-}
 ```


@@ -312,12 +358,10 @@ Example


 ```rust
-lr1! {
 %start E;

 // this internally generates augmented rule -> E eof
 E: ... ;
-}
 ```


@@ -336,11 +380,9 @@ Example


 ```rust
-lr1! {
 %eof b'\0';
 // you can access eof terminal symbol by 'eof' in the grammar
 // without %token eof ...;
-}
 ```


@@ -359,10 +401,9 @@ Example


 ```rust
 struct MyUserData { ... }

-lr1! {
 ...
+
 %userdata MyUserData;
-}

 ...
@@ -396,7 +437,6 @@ Example


 ```rust
-lr1! {
 // define tokens
 %token plus '+';
 %token hat '^';

 // reduce first for token 'plus'
 %left plus;

 // shift first for token 'hat'
 %right hat;
-}
 ```

 ```rust
-lr1! {
 // define tokens
 %token zero b'0';
 %token one b'1';

@@ -420,7 +458,6 @@ lr1!
{ // shift first for tokens in range 'zero' to 'nine' %shift [zero-nine]; -} ``` @@ -457,9 +494,7 @@ Example This production rule defines non-terminal `E` to be `A`, then zero or more `plus`, then `D` mapped to variable `d`. For more information, please refer to the [Accessing token data in ReduceAction](#accessing-token-data-in-reduceaction) section below. ```rust -lr1! { E: A plus* d=D; -} ``` @@ -509,9 +544,7 @@ Example ```rust -lr1! { E(MyType<...>): ... Tokens ... ; -} ``` @@ -542,19 +575,16 @@ Example Omitting `ReduceAction`: ```rust -lr1! { NoRuleType: ... ; RuleTypeI32(i32): ... { 0 } ; // RuleTypeI32 will be chosen E(i32): NoRuleType NoRuleType RuleTypeI32 NoRuleType; -} ``` Returning `Result<(),String>` from ReduceAction: ```rust -lr1! { // set Err variant type to String %err String; @@ -569,7 +599,6 @@ E(i32): A div a2=A { }; A(i32): ... ; -} ``` @@ -598,7 +627,6 @@ Example ```rust -lr1! { %token plus ...; // one or more 'A', then optional 'plus', then zero or more 'B' @@ -625,7 +653,6 @@ E(f32) : A+ plus? b=B* minus_or_star=[minus star] A(i32): ... ; B(f32): ... ; -} ``` @@ -649,14 +676,12 @@ enum MyErrorType { ErrVar1, ErrVar2, ErrVar3(T), - ... } -lr1! { +... -%err MyErrorType ; -} +%err MyErrorType ; ... @@ -689,7 +714,6 @@ Example ```rust -lr1! { %token plus ...; A(i32) : ... ; @@ -700,7 +724,6 @@ E(i32) : A! A A!; B: A*!; // Vec will be built from the value of A, and then ignored C: A!*; // A will be ignored first, and then repeatance pattern will be applied -} ``` @@ -709,90 +732,3 @@ C: A!*; // A will be ignored first, and then repeatance pattern will be applied - - - - - -## executable `rustylr` -An executable version of `lr1!` and `lalr1!` macro. -Converts a context-free grammar into a deterministic finite automaton (DFA) tables, -and generates a Rust code that can be used as a parser for that grammar. - -``` -cargo install rustylr -``` - -This executable will provide much more detailed, pretty-printed error messages than the procedural macros. -If you are writing a huge, complex grammar, it is recommended to use this executable than the procedural macros. -`--verbose` option is useful for debugging the grammar. It will print where the auto-generated rules are originated from and the resolving process of shift/reduce conflicts. [like](images/example1.png) [this](images/example2.png) - -Although it is convenient to use the proc-macros for small grammars, -since modern IDEs feature (rust-analyzer's auto completion, inline error messages) could be enabled. - -This program searches for `%%` in the input file. ( Not the `lr1!`, `lalr1!` macro ) - -The contents before `%%` will be copied into the output file as it is. -Context-free grammar must be followed by `%%`. -Each line must follow the syntax of [rusty_lr#syntax](#syntax) - -```rust -// my_grammar.rs -use some_crate::some_module::SomeStruct; - -enum SomeTypeDef { - A, - B, - C, -} - -%% // <-- input file splitted here - -%tokentype u8; -%start E; -%eof b'\0'; - -%token a b'a'; -%token lparen b'('; -%token rparen b')'; - -E: lparen E rparen - | P - ; - -P: a; -``` - -Calling the command will generate a Rust code `my_parser.rs`. -``` -$ rustylr my_grammar.rs my_parser.rs --verbose -``` - - -Possible options can be found by `--help`. 
-``` -$ rustylr --help -Usage: rustylr [OPTIONS] [OUTPUT_FILE] - -Arguments: - - input_file to read - - [OUTPUT_FILE] - output_file to write - - [default: out.tab.rs] - -Options: - --no-format - do not rustfmt the output - - -l, --lalr - build LALR(1) parser - - -v, --verbose - print debug information. - - print the auto-generated rules, and where they are originated from. - print the shift/reduce conflicts, and the resolving process. -``` \ No newline at end of file diff --git a/example/calculator/Cargo.toml b/example/calculator/Cargo.toml index 8015a33..29147fb 100644 --- a/example/calculator/Cargo.toml +++ b/example/calculator/Cargo.toml @@ -5,3 +5,7 @@ edition = "2021" [dependencies] rusty_lr = { path = "../../rusty_lr" } + + +[build-dependencies] +rusty_lr = { path = "../../rusty_lr", features = ["build"] } diff --git a/example/calculator/build.rs b/example/calculator/build.rs new file mode 100644 index 0000000..a7099af --- /dev/null +++ b/example/calculator/build.rs @@ -0,0 +1,11 @@ +use rusty_lr::build; + +fn main() { + println!("cargo::rerun-if-changed=src/parser.rs"); + let output = format!("{}/parser.rs", std::env::var("OUT_DIR").unwrap()); + + build::Builder::new() + .file("src/parser.rs") + .lalr() + .build(&output); +} diff --git a/example/calculator/src/main.rs b/example/calculator/src/main.rs index e30ab22..9fe5420 100644 --- a/example/calculator/src/main.rs +++ b/example/calculator/src/main.rs @@ -1,4 +1,6 @@ -mod parser; +mod parser_expanded; + +use parser_expanded as parser; fn main() { use parser::Token; diff --git a/example/calculator/src/parser.rs b/example/calculator/src/parser.rs index 5144923..da6ee8a 100644 --- a/example/calculator/src/parser.rs +++ b/example/calculator/src/parser.rs @@ -1,8 +1,3 @@ -#![allow(unused_imports)] - -use rusty_lr::lalr1; -use rusty_lr::lr1; - #[derive(Debug, Clone, Copy)] pub enum Token { Num(i32), @@ -46,60 +41,61 @@ impl Ord for Token { } } +%% + // this define struct `EParser` // where 'E' is the start symbol -lalr1! 
{ - // type of userdata - %userdata i32; - // type of token ( as Terminal symbol ) - %tokentype Token; - // start symbol - %start E; - // eof symbol; for augmented rule generation - %eof Token::Eof; +// type of userdata +%userdata i32; +// type of token ( as Terminal symbol ) +%tokentype Token; - // define tokens - %token num Token::Num(0); // `num` maps to `Token::Num(0)` - %token plus Token::Plus; - %token star Token::Star; - %token lparen Token::LParen; - %token rparen Token::RParen; +// start symbol +%start E; +// eof symbol; for augmented rule generation +%eof Token::Eof; - // resolving shift/reduce conflict - %left plus; - %left star; +// define tokens +%token num Token::Num(0); // `num` maps to `Token::Num(0)` +%token plus Token::Plus; +%token star Token::Star; +%token lparen Token::LParen; +%token rparen Token::RParen; - // data that each token holds can be accessed by its name - // s is slice of shifted terminal symbols captured by current rule - // userdata can be accessed by `data` ( &mut i32, for this situation ) - A(i32) : A plus a2=A { - println!("{:?} {:?} {:?}", A, plus, a2 ); - // ^ ^ ^ - // | | |- value of 2nd 'A' - // | |- Token - // |- value of 1st 'A' - *data += 1; - A + a2 // --> this will be new value of current 'A' - // ^ ^ - // | |- value of 2nd 'A' - // |- value of 1st 'A' - } - | M - ; +// resolving shift/reduce conflict +%left plus; +%left star; - M(i32) : M star m2=M { M * m2 } - | P - ; - - P(i32) : num { - if let Token::Num(n) = num { n } - else { return Err(format!("{:?}", num)); } - // ^^^^^^^^^^^^^^^^^^^^^^^^^^ - // reduce action returns Result<(), String> +// data that each token holds can be accessed by its name +// s is slice of shifted terminal symbols captured by current rule +// userdata can be accessed by `data` ( &mut i32, for this situation ) +A(i32) : A plus a2=A { + println!("{:?} {:?} {:?}", A, plus, a2 ); + // ^ ^ ^ + // | | |- value of 2nd 'A' + // | |- Token + // |- value of 1st 'A' + *data += 1; + A + a2 // --> this will be new value of current 'A' + // ^ ^ + // | |- value of 2nd 'A' + // |- value of 1st 'A' } - | lparen E rparen { E } - ; + | M + ; + +M(i32) : M star m2=M { M * m2 } + | P + ; - E(i32) : A; +P(i32) : num { + if let Token::Num(n) = num { n } + else { return Err(format!("{:?}", num)); } + // ^^^^^^^^^^^^^^^^^^^^^^^^^^ + // reduce action returns Result<(), String> } + | lparen E rparen { E } + ; + +E(i32) : A; diff --git a/example/calculator/src/parser_expanded.rs b/example/calculator/src/parser_expanded.rs new file mode 100644 index 0000000..a118a21 --- /dev/null +++ b/example/calculator/src/parser_expanded.rs @@ -0,0 +1 @@ +include!(concat!(env!("OUT_DIR"), "/parser.rs")); diff --git a/rusty_lr/Cargo.toml b/rusty_lr/Cargo.toml index fa6b25d..87fda2c 100644 --- a/rusty_lr/Cargo.toml +++ b/rusty_lr/Cargo.toml @@ -1,17 +1,19 @@ [package] name = "rusty_lr" -version = "2.0.4" +version = "2.1.0" edition = "2021" license = "MIT" description = "yacc-like, LR(1) and LALR(1) parser generator with custom reduce action" repository = "https://github.com/ehwan/RustyLR" readme = "../README.md" keywords = ["parser", "yacc", "context-free-grammar", "lr", "compiler"] -categories = ["parsing", "compilers"] +categories = ["parsing", "compilers", "parser-implementations"] [dependencies] rusty_lr_core = "2.2" rusty_lr_derive = "1.6" +rusty_lr_buildscript = { version = "0.1", optional = true } +# rusty_lr_buildscript = { path = "../rusty_lr_buildscript", optional = true } # rusty_lr_core = { path = "../rusty_lr_core" } # rusty_lr_derive = { path = 
"../rusty_lr_derive" } @@ -19,6 +21,7 @@ rusty_lr_derive = "1.6" default = [] fxhash = ["rusty_lr_core/fxhash"] builder = ["rusty_lr_core/builder"] +build = ["dep:rusty_lr_buildscript"] # default = ["core", "derive"] # core = ["rusty_lr_core"] # derive = ["rusty_lr_derive"] diff --git a/rusty_lr/src/lib.rs b/rusty_lr/src/lib.rs index 86e8067..5a822ef 100644 --- a/rusty_lr/src/lib.rs +++ b/rusty_lr/src/lib.rs @@ -1,18 +1,15 @@ //! # RustyLR //! yacc-like LR(1) and LALR(1) Deterministic Finite Automata (DFA) generator from Context Free Grammar (CFGs). //! -//! RustyLR provides both [executable](#executable-rustylr) and [procedural macros](#proc-macro) to generate LR(1) and LALR(1) parser. +//! RustyLR provides [procedural macros](#proc-macro) and [buildscript tools](#integrating-with-buildrs) to generate LR(1) and LALR(1) parser. //! The generated parser will be a pure Rust code, and the calculation of building DFA will be done at compile time. //! Reduce action can be written in Rust code, -//! and the error messages are readable and detailed with [executable](#executable-rustylr). -//! For huge and complex grammars, it is recommended to use the [executable](#executable-rustylr) version. +//! and the error messages are **readable and detailed**. +//! For huge and complex grammars, it is recommended to use the [buildscipt](#integrating-with-buildrs). //! -//! By default, RustyLR uses [`std::collections::HashMap`] for the parser tables. -//! If you want to use `FxHashMap` from [`rustc-hash`](https://github.com/rust-lang/rustc-hash), add `features=["fxhash"]` to your `Cargo.toml`. -//! ```toml -//! [dependencies] -//! rusty_lr = { version = "...", features = ["fxhash"] } -//! ``` +//! #### `features` in `Cargo.toml` +//! - `build` : Enable buildscript tools. +//! - `fxhash` : In parser table, replace `std::collections::HashMap` with `FxHashMap` from [`rustc-hash`](https://github.com/rust-lang/rustc-hash). //! //! ## Features //! - pure Rust implementation @@ -21,7 +18,7 @@ //! - customizable reduce action //! - resolving conflicts of ambiguous grammar //! - regex patterns partially supported -//! - executable for generating parser tables +//! - tools for integrating with `build.rs` //! //! ## proc-macro //! Below procedural macros are provided: @@ -39,6 +36,68 @@ //! All structs above are prefixed by ``. //! In most cases, what you want is the `Parser` and `ParseError` structs, and the others are used internally. //! +//! ## Integrating with `build.rs` +//! This buildscripting tool will provide much more detailed, pretty-printed error messages than the procedural macros. +//! If you are writing a huge, complex grammar, it is recommended to use buildscript than the procedural macros. +//! Generated code will contain the same structs and functions as the procedural macros. In your actual source code, you can `include!` the generated file. +//! +//! The program searches for `%%` in the input file, not the `lr1!`, `lalr1!` macro. +//! The contents before `%%` will be copied into the output file as it is. +//! And the context-free grammar must be followed by `%%`. +//! +//! ```rust +//! // parser.rs +//! use some_crate::some_module::SomeStruct; +//! +//! enum SomeTypeDef { +//! A, +//! B, +//! C, +//! } +//! +//! %% // <-- input file splitted here +//! +//! %tokentype u8; +//! %start E; +//! %eof b'\0'; +//! +//! %token a b'a'; +//! %token lparen b'('; +//! %token rparen b')'; +//! +//! E: lparen E rparen +//! | P +//! ; +//! +//! P: a; +//! ``` +//! +//! 
You must enable the feature `build` to use in the build script. +//! ```toml +//! [build-dependencies] +//! rusty_lr = { version = "...", features = ["build"] } +//! ``` +//! +//! ```rust +//! // build.rs +//! use rusty_lr::build; +//! +//! fn main() { +//! println!("cargo::rerun-if-changed=src/parser.rs"); +//! +//! let output = format!("{}/parser.rs", std::env::var("OUT_DIR").unwrap()); +//! build::Builder::new() +//! .file("src/parser.rs") // path to the input file +//! // .lalr() // to generate LALR(1) parser +//! .build(&output); // path to the output file +//! } +//! ``` +//! +//! In your source code, include the generated file. +//! ```rust +//! include!(concat!(env!("OUT_DIR"), "/parser.rs")); +//! ``` +//! //! ## Start Parsing //! The `Parser` struct has the following functions: //! - `new()` : create new parser @@ -116,92 +175,14 @@ //! Syntax can be found in [repository](https://github.com/ehwan/RustyLR/tree/main?tab=readme-ov-file#syntax). //! //! -//! ## executable `rustylr` -//! An executable version of `lr1!` and `lalr1!` macro. -//! Converts a context-free grammar into a deterministic finite automaton (DFA) tables, -//! and generates a Rust code that can be used as a parser for that grammar. -//! -//! ``` -//! cargo install rustylr -//! ``` -//! -//! This executable will provide much more detailed, pretty-printed error messages than the procedural macros. -//! If you are writing a huge, complex grammar, it is recommended to use this executable than the procedural macros. -//! `--verbose` option is useful for debugging the grammar. -//! It will print where the auto-generated rules are originated from and the resolving process of shift/reduce conflicts. -//! [like](https://github.com/ehwan/RustyLR/blob/main/images/example1.png) [this](https://github.com/ehwan/RustyLR/blob/main/images/example2.png) -//! -//! Although it is convenient to use the proc-macros for small grammars, -//! since modern IDEs feature (rust-analyzer's auto completion, inline error messages) could be enabled. -//! -//! This program searches for `%%` in the input file. ( Not the `lr1!`, `lalr1!` macro ) -//! -//! The contents before `%%` will be copied into the output file as it is. -//! Context-free grammar must be followed by `%%`. -//! Each line must follow the syntax of [rusty_lr#syntax](#syntax) -//! -//! ```rust -//! // my_grammar.rs -//! use some_crate::some_module::SomeStruct; -//! -//! enum SomeTypeDef { -//! A, -//! B, -//! C, -//! } -//! -//! %% // <-- input file splitted here -//! -//! %tokentype u8; -//! %start E; -//! %eof b'\0'; -//! -//! %token a b'a'; -//! %token lparen b'('; -//! %token rparen b')'; -//! -//! E: lparen E rparen -//! | P -//! ; -//! -//! P: a; -//! ``` -//! -//! Calling the command will generate a Rust code `my_parser.rs`. -//! ``` -//! $ rustylr my_grammar.rs my_parser.rs --verbose -//! ``` -//! -//! -//! Possible options can be found by `--help`. -//! ``` -//! $ rustylr --help -//! Usage: rustylr [OPTIONS] [OUTPUT_FILE] -//! -//! Arguments: -//! -//! input_file to read -//! -//! [OUTPUT_FILE] -//! output_file to write -//! -//! [default: out.tab.rs] -//! -//! Options: -//! --no-format -//! do not rustfmt the output -//! -//! -l, --lalr -//! build LALR(1) parser -//! -//! -v, --verbose -//! print debug information. -//! -//! print the auto-generated rules, and where they are originated from. -//! print the shift/reduce conflicts, and the resolving process. -//! 
```

// re-exports
pub use rusty_lr_core::*;
pub use rusty_lr_derive::*;
+
+/// tools for build.rs
+#[cfg(feature = "build")]
+pub mod build {
+    pub use rusty_lr_buildscript::*;
+}
diff --git a/rusty_lr_buildscript/Cargo.toml b/rusty_lr_buildscript/Cargo.toml
new file mode 100644
index 0000000..d866cb8
--- /dev/null
+++ b/rusty_lr_buildscript/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "rusty_lr_buildscript"
+version = "0.1.0"
+edition = "2021"
+license = "MIT"
+description = "buildscript tools for rusty_lr"
+repository = "https://github.com/ehwan/RustyLR"
+readme = "../README.md"
+keywords = ["parser", "yacc", "context-free-grammar", "lr", "compiler"]
+categories = ["parsing"]
+
+
+[dependencies]
+proc-macro2 = { version = "1.0.86", features = ["span-locations"] }
+quote = "1.0"
+# rusty_lr_parser = { path = "../rusty_lr_parser" }
+rusty_lr_parser = "3.4"
+# rusty_lr_core = { path = "../rusty_lr_core", features = ["fxhash", "builder"] }
+rusty_lr_core = { version = "2.2", features = ["fxhash", "builder"] }
+codespan-reporting = "0.11"
diff --git a/rusty_lr_buildscript/src/lib.rs b/rusty_lr_buildscript/src/lib.rs
new file mode 100644
index 0000000..9823df9
--- /dev/null
+++ b/rusty_lr_buildscript/src/lib.rs
@@ -0,0 +1,980 @@
+//! Build script for rusty_lr
+//!
+//! This crate is private and not intended to be used directly.
+//! Please use the [`rusty_lr`](https://crates.io/crates/rusty_lr) crate instead.
+//!
+//! ```no_run
+//! use rusty_lr_buildscript::Builder;
+//!
+//! fn main() {
+//!     println!("cargo::rerun-if-changed=src/parser/parser.rs");
+//!
+//!     let output_dir = std::env::var("OUT_DIR").unwrap();
+//!     let output = format!("{}/parser.rs", output_dir);
+//!     Builder::new()
+//!         .file("src/parser/parser.rs")
+//!         .build(&output);
+//! }
+//! ```

+pub mod output;
+mod split;
+mod utils;
+
+use codespan_reporting::diagnostic::Diagnostic;
+use codespan_reporting::diagnostic::Label;
+use codespan_reporting::files::SimpleFiles;
+use codespan_reporting::term;
+use codespan_reporting::term::termcolor::ColorChoice;
+use codespan_reporting::term::termcolor::StandardStream;
+
+use proc_macro2::Ident;
+use proc_macro2::Span;
+use proc_macro2::TokenStream;
+
+use quote::quote;
+use rusty_lr_core::ShiftedRule;
+use rusty_lr_parser::error::ArgError;
+use rusty_lr_parser::error::EmitError;
+use rusty_lr_parser::error::ParseArgError;
+use rusty_lr_parser::error::ParseError;
+
+use std::collections::BTreeMap;
+use std::collections::BTreeSet;
+use std::fs::read;
+use std::fs::write;
+
+/// Main entry for the build script
+pub struct Builder {
+    /// input file to read
+    input_file: Option<String>,
+
+    /// build LALR(1) parser
+    lalr: bool,
+
+    /// print debug information.
+    ///
+    /// print the auto-generated rules, and where they originated from.
+    /// print the shift/reduce conflicts, and the resolving process.
+    verbose: bool,
+}
+
+impl Builder {
+    pub fn new() -> Self {
+        Self {
+            input_file: None,
+            lalr: false,
+            verbose: false,
+        }
+    }
+
+    /// set input file
+    pub fn file(&mut self, filename: &str) -> &mut Self {
+        self.input_file = Some(filename.to_string());
+        self
+    }
+
+    /// set to build LALR(1) parser
+    pub fn lalr(&mut self) -> &mut Self {
+        self.lalr = true;
+        self
+    }
+
+    /// print debug information to stdout.
+    pub fn verbose(&mut self) -> &mut Self {
+        self.verbose = true;
+        self
+    }
+
+    /// build and emit code to output file
+    pub fn build(&self, output_file: &str) {
+        let output = match self.build_impl() {
+            Ok(output) => {
+                let stream1 = output.user_stream;
+                let stream2 = output.generated_stream;
+                quote! {
+                    #stream1
+
+                    #stream2
+                }
+            }
+            Err(_) => {
+                panic!("build failed");
+            }
+        };
+
+        write(output_file, output.to_string()).expect("Failed to write to file");
+    }
+
+    /// for internal use
+    pub fn build_impl(&self) -> Result<output::Output, String> {
+        if self.input_file.is_none() {
+            eprintln!("Input file not set");
+            return Err("Input file not set".to_string());
+        }
+
+        let input_file = self.input_file.as_ref().unwrap();
+        // read file
+        let input_bytes = match read(input_file) {
+            Ok(bytes) => bytes,
+            Err(e) => {
+                let message = format!("Error reading file: {}", e);
+                eprintln!("{}", message);
+                return Err(message);
+            }
+        };
+
+        let str = match String::from_utf8(input_bytes) {
+            Ok(str) => str,
+            Err(e) => {
+                let message = format!("Error reading utf-8: {}", e);
+                eprintln!("{}", message);
+                return Err(message);
+            }
+        };
+
+        let mut files = SimpleFiles::new();
+        let file_id = files.add(input_file, str.clone());
+
+        // lex with proc-macro2
+        let token_stream: TokenStream = match str.parse() {
+            Ok(token_stream) => token_stream,
+            Err(e) => {
+                let range = e.span().byte_range();
+                let diag = Diagnostic::error()
+                    .with_message("Lexing error")
+                    .with_labels(vec![
+                        Label::primary(file_id, range).with_message(e.to_string())
+                    ]);
+                let writer = StandardStream::stderr(ColorChoice::Auto);
+                let config = codespan_reporting::term::Config::default();
+                term::emit(&mut writer.lock(), &config, &files, &diag)
+                    .expect("Failed to write to stderr");
+                return Err("Lexing error".to_string());
+            }
+        };
+
+        // split stream by '%%'
+        let (output_stream, macro_stream) = match split::split_stream(token_stream) {
+            Ok((output_stream, macro_stream)) => (output_stream, macro_stream),
+            Err(_) => {
+                let diag = Diagnostic::error()
+                    .with_message("Cannot find `%%`")
+                    .with_notes(vec![
+                        "Please put `%%` to separate the code part and the context-free grammar part"
+                            .to_string(),
+                    ]);
+                let writer = StandardStream::stderr(ColorChoice::Auto);
+                let config = codespan_reporting::term::Config::default();
+                term::emit(&mut writer.lock(), &config, &files, &diag)
+                    .expect("Failed to write to stderr");
+                return Err(diag.message);
+            }
+        };
+
+        let grammar_args = match rusty_lr_parser::grammar::Grammar::parse_args(macro_stream) {
+            Ok(grammar_args) => grammar_args,
+            Err(e) => {
+                let diag =
+                    match e {
+                        ParseArgError::MacroLineParse { span, message } => {
+                            let range = span.byte_range();
+
+                            Diagnostic::error()
+                                .with_message("Parse Failed")
+                                .with_labels(vec![
+                                    Label::primary(file_id, range).with_message("Error here")
+                                ])
+                                .with_notes(vec![message])
+                        }
+                        ParseArgError::MacroLineParseEnd { message } => Diagnostic::error()
+                            .with_message("Parse Failed")
+                            .with_notes(vec![message]),
+
+                        _ => {
+                            let message = e.short_message();
+                            let span = e.span().byte_range();
+                            Diagnostic::error().with_message(message).with_labels(vec![
+                                Label::primary(file_id, span).with_message("occurred here"),
+                            ])
+                        }
+                    };
+
+                let writer = StandardStream::stderr(ColorChoice::Auto);
+                let config = codespan_reporting::term::Config::default();
+                term::emit(&mut writer.lock(), &config, &files, &diag)
+                    .expect("Failed to write to stderr");
+                return Err(diag.message);
+            }
+        };
+        match rusty_lr_parser::grammar::Grammar::arg_check_error(&grammar_args) {
+            Ok(_) => {}
+            Err(e) => {
+                let diag = match e {
+                    ArgError::MultipleModulePrefixDefinition(
+                        (span1, tokenstream1),
+                        (span2, tokenstream2),
+                    ) => {
+                        let range1 = utils::span_stream_range(span1, tokenstream1);
+                        let range2 = utils::span_stream_range(span2, tokenstream2);
+
+                        Diagnostic::error()
.with_message("Multiple %moduleprefix definition") + .with_labels(vec![ + Label::primary(file_id, range1).with_message("First definition"), + Label::primary(file_id, range2).with_message("Other definition"), + ]) + .with_notes(vec![ + "Only one %moduleprefix definition is allowed".to_string() + ]) + } + ArgError::MultipleUserDataDefinition( + (span1, tokenstream1), + (span2, tokenstream2), + ) => { + let range1 = utils::span_stream_range(span1, tokenstream1); + let range2 = utils::span_stream_range(span2, tokenstream2); + + Diagnostic::error() + .with_message("Multiple %userdata definition") + .with_labels(vec![ + Label::primary(file_id, range1).with_message("First definition"), + Label::primary(file_id, range2).with_message("Other definition"), + ]) + .with_notes( + vec!["Only one %userdata definition is allowed".to_string()], + ) + } + ArgError::MultipleErrorDefinition( + (span1, tokenstream1), + (span2, tokenstream2), + ) => { + let range1 = utils::span_stream_range(span1, tokenstream1); + let range2 = utils::span_stream_range(span2, tokenstream2); + + Diagnostic::error() + .with_message("Multiple %error definition") + .with_labels(vec![ + Label::primary(file_id, range1).with_message("First definition"), + Label::primary(file_id, range2).with_message("Other definition"), + ]) + .with_notes(vec!["Only one %error definition is allowed".to_string()]) + } + ArgError::MultipleTokenTypeDefinition( + (span1, tokenstream1), + (span2, tokenstream2), + ) => { + let range1 = utils::span_stream_range(span1, tokenstream1); + let range2 = utils::span_stream_range(span2, tokenstream2); + + Diagnostic::error() + .with_message("Multiple %tokentype definition") + .with_labels(vec![ + Label::primary(file_id, range1).with_message("First definition"), + Label::primary(file_id, range2).with_message("Other definition"), + ]) + .with_notes(vec![ + "Only one %tokentype definition is allowed".to_string() + ]) + } + ArgError::MultipleEofDefinition( + (span1, tokenstream1), + (span2, tokenstream2), + ) => { + let range1 = utils::span_stream_range(span1, tokenstream1); + let range2 = utils::span_stream_range(span2, tokenstream2); + + Diagnostic::error() + .with_message("Multiple %eof definition") + .with_labels(vec![ + Label::primary(file_id, range1).with_message("First definition"), + Label::primary(file_id, range2).with_message("Other definition"), + ]) + .with_notes(vec!["Only one %eof definition is allowed".to_string()]) + } + ArgError::MultipleStartDefinition(ident1, ident2) => { + let range1 = ident1.span().byte_range(); + let range2 = ident2.span().byte_range(); + + Diagnostic::error() + .with_message("Multiple %start definition") + .with_labels(vec![ + Label::primary(file_id, range1).with_message("First definition"), + Label::primary(file_id, range2).with_message("Other definition"), + ]) + .with_notes(vec!["Only one %start definition is allowed".to_string()]) + } + + ArgError::StartNotDefined => Diagnostic::error() + .with_message("%start not defined") + .with_labels(vec![]) + .with_notes(vec![ + "%start must be defined".to_string(), + ">>> %start ".to_string(), + ]), + ArgError::EofNotDefined => Diagnostic::error() + .with_message("%eof not defined") + .with_labels(vec![]) + .with_notes(vec![ + "%eof must be defined".to_string(), + ">>> %eof ".to_string(), + ]), + ArgError::TokenTypeNotDefined => Diagnostic::error() + .with_message("%tokentype not defined") + .with_labels(vec![]) + .with_notes(vec![ + "%tokentype must be defined".to_string(), + ">>> %tokentype ".to_string(), + ]), + _ => { + let message = 
e.short_message(); + let span = e.span().byte_range(); + Diagnostic::error() + .with_message(message) + .with_labels(vec![ + Label::primary(file_id, span).with_message("occured here") + ]) + } + }; + + let writer = StandardStream::stderr(ColorChoice::Auto); + let config = codespan_reporting::term::Config::default(); + term::emit(&mut writer.lock(), &config, &files, &diag) + .expect("Failed to write to stderr"); + return Err(diag.message); + } + } + + // parse lines + let grammar = match rusty_lr_parser::grammar::Grammar::from_grammar_args(grammar_args) { + Ok(grammar) => grammar, + Err(e) => { + let diag = match e { + ParseError::MultipleRuleDefinition(ident1, ident2) => { + let range1 = ident1.span().byte_range(); + let range2 = ident2.span().byte_range(); + + Diagnostic::error() + .with_message("Multiple rule definition") + .with_labels(vec![ + Label::primary(file_id, range1).with_message("First definition"), + Label::primary(file_id, range2).with_message("Other definition"), + ]) + .with_notes(vec!["Rule name must be unique".to_string()]) + } + + ParseError::MultipleReduceDefinition { terminal, old, new } => { + let old_range = old.0.byte_range().start..old.1.byte_range().end; + let old_string = match old.2 { + rusty_lr_core::ReduceType::Left => "%left", + rusty_lr_core::ReduceType::Right => "%right", + }; + let new_range = new.0.byte_range().start..new.1.byte_range().end; + let new_string = match new.2 { + rusty_lr_core::ReduceType::Left => "%left", + rusty_lr_core::ReduceType::Right => "%right", + }; + + Diagnostic::error() + .with_message("Multiple reduce definition") + .with_labels(vec![ + Label::primary(file_id, terminal.span().byte_range()).with_message( + "This terminal symbol is defined as both of %left and %right", + ), + Label::secondary(file_id, old_range) + .with_message(format!("was set as {} here", old_string)), + Label::secondary(file_id, new_range) + .with_message(format!("was set as {} here", new_string)), + ]) + .with_notes(vec![ + "Reduce type must be unique, either %left or %right".to_string() + ]) + } + + ParseError::TermNonTermConflict { + name, + terminal, + non_terminal, + } => { + let range = name.span().byte_range(); + + Diagnostic::error() + .with_message("Ambiguous token name") + .with_labels(vec![ + Label::primary(file_id, range).with_message( + "This name is used for both terminal and non-terminal", + ), + Label::secondary(file_id, terminal.span().byte_range()) + .with_message("Terminal definition here"), + Label::secondary(file_id, non_terminal.span().byte_range()) + .with_message("Non-terminal definition here"), + ]) + .with_notes(vec![ + "Terminal and non-terminal name must be unique".to_string() + ]) + } + + ParseError::InvalidTerminalRange( + (first, first_index, first_stream), + (last, last_index, last_stream), + ) => { + let range1 = first.span().byte_range(); + let range2 = last.span().byte_range(); + let range = range1.start..range2.end; + let range1 = utils::tokenstream_range(first_stream); + let range2 = utils::tokenstream_range(last_stream); + + Diagnostic::error() + .with_message("Invalid terminal range") + .with_labels(vec![ + Label::primary(file_id, range).with_message("Invalid range here"), + Label::secondary(file_id, range1).with_message(format!("First terminal symbol (index {})", first_index)), + Label::secondary(file_id, range2).with_message(format!("Last terminal symbol (index {})", last_index)), + ]).with_notes(vec![ + "First terminal symbol has to be less than or equal to the last terminal symbol".to_string() + ]) + } + + 
ParseError::StartNonTerminalNotDefined(ident) => { + let range = ident.span().byte_range(); + + Diagnostic::error() + .with_message("Start non-terminal not defined") + .with_labels(vec![Label::primary(file_id, range) + .with_message("This name is given to %start")]) + .with_notes(vec!["Non-terminal name must be defined".to_string()]) + } + + ParseError::TerminalNotDefined(ident) => { + let range = ident.span().byte_range(); + + Diagnostic::error() + .with_message("Terminal symbol not defined") + .with_labels(vec![Label::primary(file_id, range) + .with_message("This terminal symbol is not defined")]) + .with_notes(vec!["Terminal symbol must be defined".to_string()]) + } + + ParseError::MultipleTokenDefinition(ident1, ident2) => { + let range1 = ident1.span().byte_range(); + let range2 = ident2.span().byte_range(); + + Diagnostic::error() + .with_message("Multiple %token definition") + .with_labels(vec![ + Label::primary(file_id, range1).with_message("First definition"), + Label::primary(file_id, range2).with_message("Other definition"), + ]) + .with_notes(vec!["Token name must be unique".to_string()]) + } + + ParseError::EofDefined(ident) => { + let range = ident.span().byte_range(); + + Diagnostic::error() + .with_message("'eof' is reserved name") + .with_labels(vec![Label::primary(file_id, range) + .with_message("This name is reserved")]) + } + ParseError::AugmentedDefined(ident) => { + let range = ident.span().byte_range(); + + Diagnostic::error() + .with_message("'Augmented' is reserved name") + .with_labels(vec![Label::primary(file_id, range) + .with_message("This name is reserved")]) + } + ParseError::ReservedName(ident) => { + let range = ident.span().byte_range(); + + Diagnostic::error() + .with_message(format!("'{}' is reserved name", ident)) + .with_labels(vec![Label::primary(file_id, range) + .with_message("This name is reserved")]) + } + _ => { + let message = e.short_message(); + let span = e.span().byte_range(); + Diagnostic::error() + .with_message(message) + .with_labels(vec![ + Label::primary(file_id, span).with_message("occured here") + ]) + } + }; + + let writer = StandardStream::stderr(ColorChoice::Auto); + let config = codespan_reporting::term::Config::default(); + term::emit(&mut writer.lock(), &config, &files, &diag) + .expect("Failed to write to stderr"); + + return Err(diag.message); + } + }; + + // expand macro + let expanded_stream = match grammar.emit_compiletime(self.lalr) { + Ok(expanded_stream) => expanded_stream, + Err(e) => { + let diag = match e.as_ref() { + EmitError::RuleTypeDefinedButActionNotDefined { + name, + rule_local_id, + } => { + // `name` must not be generated rule, + // since it is programmically generated, it must have a proper reduce action + + let rule_line = + &grammar.rules.get(name).unwrap().rule_lines[*rule_local_id]; + let rule_line_range = if rule_line.tokens.is_empty() { + rule_line.separator_span.byte_range() + } else { + let first = rule_line.separator_span.byte_range().start; + let last = rule_line.tokens.last().unwrap().end_span.byte_range().end; + first..last + }; + Diagnostic::error() + .with_message("Reduce action not defined") + .with_labels(vec![ + Label::secondary(file_id, name.span().byte_range()) + .with_message("This rule has a type definition"), + Label::primary(file_id, rule_line_range) + .with_message("This rule line has no reduce action"), + ]) + .with_notes(vec!["".to_string()]) + } + + EmitError::ShiftReduceConflict { + term, + reduce_rule: (reduceid, reduce_production_rule), + shift_rules, + } => { + let mut 
message = format!( + "Shift/Reduce conflict:\nReduce rule:\n\t>>> {}\nShift rules:", + reduce_production_rule + ); + for (_, shifted_rule) in shift_rules.iter() { + message.push_str(format!("\n\t>>> {}", shifted_rule).as_str()); + } + + let (name, rules, rule) = + grammar.get_rule_by_id(*reduceid).expect("Rule not found"); + let mut labels = Vec::new(); + + if !name + .to_string() + .starts_with(rusty_lr_parser::utils::AUTO_GENERATED_RULE_PREFIX) + { + let (rule_begin, rule_end) = rules.rule_lines[rule].span_pair(); + let rule_range = + rule_begin.byte_range().start..rule_end.byte_range().end; + + labels.push( + Label::primary(file_id, name.span().byte_range()) + .with_message(format!("Reduce rule {} was defined here", name)), + ); + labels.push( + Label::secondary(file_id, rule_range) + .with_message("in this line".to_string()), + ); + } else { + let origin_span = grammar + .generated_root_span + .get(name) + .expect("generated_root_span::rule not found"); + let origin_range = + origin_span.0.byte_range().start..origin_span.1.byte_range().end; + labels.push( + Label::primary(file_id, origin_range).with_message(format!( + "Reduce rule {} was generated here", + name + )), + ); + } + + for (shiftid, shift_rule) in shift_rules.iter() { + let (name, rules, rule) = + grammar.get_rule_by_id(*shiftid).expect("Rule not found"); + if !name + .to_string() + .starts_with(rusty_lr_parser::utils::AUTO_GENERATED_RULE_PREFIX) + { + let first_shift_token_byte = rules.rule_lines[rule].tokens + [shift_rule.shifted] + .begin_span + .byte_range() + .start; + let (_, rule_end) = rules.rule_lines[rule].span_pair(); + let rule_range = first_shift_token_byte..rule_end.byte_range().end; + labels.push( + Label::primary(file_id, name.span().byte_range()).with_message( + format!("Shift rule {} was defined here", name), + ), + ); + labels.push( + Label::secondary(file_id, rule_range) + .with_message("in this line".to_string()), + ); + } else { + let origin_span = grammar + .generated_root_span + .get(name) + .expect("generated_root_span::rule not found"); + let origin_range = origin_span.0.byte_range().start + ..origin_span.1.byte_range().end; + labels.push(Label::secondary(file_id, origin_range).with_message( + format!("Shift rule {} was generated here", name), + )); + } + } + Diagnostic::error() + .with_message(message) + .with_labels(labels) + .with_notes(vec![ + format!("conflict terminal: {}", term), + format!( + "Try to rearrange the rules or resolve conflict by set reduce type" + ), + format!(">>> %left {}", term), + format!(">>> %right {}", term), + ]) + } + EmitError::ReduceReduceConflict { + lookahead, + rule1: (ruleid1, production_rule1), + rule2: (ruleid2, production_rule2), + } => { + let (name1, rules1, rule1) = + grammar.get_rule_by_id(*ruleid1).expect("Rule not found 1"); + let (rule1_begin, rule1_end) = rules1.rule_lines[rule1].span_pair(); + let rule_range1 = + rule1_begin.byte_range().start..rule1_end.byte_range().end; + let (name2, rules2, rule2) = + grammar.get_rule_by_id(*ruleid2).expect("Rule not found 2"); + let (rule2_begin, rule2_end) = rules2.rule_lines[rule2].span_pair(); + let rule_range2 = + rule2_begin.byte_range().start..rule2_end.byte_range().end; + + let mut labels = Vec::new(); + + // no byte range for auto generated rules + if !name1 + .to_string() + .starts_with(rusty_lr_parser::utils::AUTO_GENERATED_RULE_PREFIX) + { + labels.push( + Label::primary(file_id, name1.span().byte_range()) + .with_message(format!("{} was defined here", name1)), + ); + labels.push( + 
Label::secondary(file_id, rule_range1) + .with_message("in this line".to_string()), + ); + } else { + let origin_span = grammar + .generated_root_span + .get(name1) + .expect("generated_root_span::rule not found"); + let origin_range = + origin_span.0.byte_range().start..origin_span.1.byte_range().end; + + labels.push( + Label::primary(file_id, origin_range) + .with_message(format!("{} was generated here", name1)), + ); + } + if !name2 + .to_string() + .starts_with(rusty_lr_parser::utils::AUTO_GENERATED_RULE_PREFIX) + { + labels.push( + Label::primary(file_id, name2.span().byte_range()) + .with_message(format!("{} was defined here", name2)), + ); + labels.push( + Label::secondary(file_id, rule_range2) + .with_message("in this line".to_string()), + ); + } else { + let origin_span = grammar + .generated_root_span + .get(name2) + .expect("generated_root_span::rule not found"); + let origin_range = + origin_span.0.byte_range().start..origin_span.1.byte_range().end; + + labels.push( + Label::primary(file_id, origin_range) + .with_message(format!("{} was generated here", name2)), + ); + } + Diagnostic::error() + .with_message(format!( + "Reduce/Reduce conflict:\n>>> {}\n>>> {}", + production_rule1, production_rule2 + )) + .with_labels(labels) + .with_notes(vec![format!("with lookahead {}", lookahead)]) + } + + _ => { + let message = e.short_message(); + let span = e.span().byte_range(); + Diagnostic::error() + .with_message(message) + .with_labels(vec![ + Label::primary(file_id, span).with_message("occured here") + ]) + } + }; + + let writer = StandardStream::stderr(ColorChoice::Auto); + let config = codespan_reporting::term::Config::default(); + term::emit(&mut writer.lock(), &config, &files, &diag) + .expect("Failed to write to stderr"); + + return Err(diag.message); + } + }; + + // this comments will be printed to the output file + // build again here whether it was built before + // since many informations are removed in the rusty_lr_parser output + let mut debug_comments = String::new(); + { + let mut builder = grammar.create_grammar(); + debug_comments.push_str(format!("{:=^80}\n", "Grammar").as_str()); + for rule in builder.rules.iter() { + debug_comments.push_str(format!("{}\n", rule).as_str()); + } + let parser = if self.lalr { + match builder.build_lalr(Ident::new( + rusty_lr_parser::utils::AUGMENTED_NAME, + Span::call_site(), + )) { + Ok(parser) => parser, + Err(_) => unreachable!("Grammar building failed"), + } + } else { + match builder.build(Ident::new( + rusty_lr_parser::utils::AUGMENTED_NAME, + Span::call_site(), + )) { + Ok(parser) => parser, + Err(_) => unreachable!("Grammar building failed"), + } + }; + + // print note about generated rules + if self.verbose { + let mut rules_on_same_root = BTreeMap::new(); + + // `generated_root_span` contains only auto-generated rules + for (rule_name, root_span) in grammar.generated_root_span.iter() { + let start = root_span.0.byte_range().start; + let end = root_span.1.byte_range().end; + rules_on_same_root + .entry((start, end)) + .or_insert_with(Vec::new) + .push(rule_name); + } + + for (root_range, rules) in rules_on_same_root.into_iter() { + let mut rules_string = String::new(); + for rule_name in rules.into_iter() { + let name_str = rule_name.to_string(); + let name_len = name_str.len(); + let front_padding = " ".repeat(name_len); + let rule_lines = grammar.rules.get(rule_name).expect("Rule not found"); + + for (idx, rule_line) in rule_lines.rule_lines.iter().enumerate() { + let mut line_string = String::new(); + for (idx, token) in 
rule_line.tokens.iter().enumerate() { + line_string.push_str(token.token.to_string().as_str()); + if idx < rule_line.tokens.len() - 1 { + line_string.push(' '); + } + } + + if idx == 0 { + rules_string.push_str( + format!("\n{} -> {}", name_str, line_string).as_str(), + ); + } else { + rules_string.push_str( + format!("\n{} | {}", front_padding, line_string).as_str(), + ); + } + } + rules_string.push_str(format!("\n{} ;", front_padding).as_str()); + } + + let message = format!("Auto-generated rules:{}", rules_string); + let diag = Diagnostic::note() + .with_message(message) + .with_labels(vec![Label::primary(file_id, root_range.0..root_range.1) + .with_message("was generated here")]); + + let writer = StandardStream::stderr(ColorChoice::Auto); + let config = codespan_reporting::term::Config::default(); + term::emit(&mut writer.lock(), &config, &files, &diag) + .expect("Failed to write to stderr"); + } + + // about shift/reduce conflict resolving + for state in parser.states.iter() { + let mut reduce_rules = BTreeMap::new(); + let mut shift_rules = BTreeMap::new(); + + for (shifted_rule_ref, lookaheads) in state.ruleset.rules.iter() { + // is end of rule, add to reduce + if shifted_rule_ref.shifted + == builder.rules[shifted_rule_ref.rule].rule.len() + { + for token in lookaheads.iter() { + reduce_rules.insert(token, shifted_rule_ref.rule); + } + } + + // if it is not end, and next token is terminal, add to shift + if let Some(rusty_lr_core::Token::Term(token)) = builder.rules + [shifted_rule_ref.rule] + .rule + .get(shifted_rule_ref.shifted) + { + shift_rules + .entry(token) + .or_insert_with(BTreeSet::new) + .insert(*shifted_rule_ref); + } + } + + // check shift/reduce conflict + for (term, shift_rules) in shift_rules.into_iter() { + if let Some(reduce_rule) = reduce_rules.get(term) { + // shift/reduce conflict here + // since there were not error reaching here, 'term' must be set reduce_type + + let mut message = format!( + "Shift/Reduce conflict with token {} was resolved:\nReduce rule:\n>>> {}\nShift rules:", + term, + &builder.rules[*reduce_rule] + ); + for shifted_rule in shift_rules.iter() { + let shifted_rule = ShiftedRule { + rule: builder.rules[shifted_rule.rule].clone(), + shifted: shifted_rule.shifted, + }; + message.push_str(format!("\n>>> {}", shifted_rule).as_str()); + } + + let (name, rules, rule) = grammar + .get_rule_by_id(*reduce_rule) + .expect("Rule not found"); + let mut labels = Vec::new(); + + if !name + .to_string() + .starts_with(rusty_lr_parser::utils::AUTO_GENERATED_RULE_PREFIX) + { + let (rule_begin, rule_end) = rules.rule_lines[rule].span_pair(); + let rule_range = + rule_begin.byte_range().start..rule_end.byte_range().end; + labels.push( + Label::primary(file_id, name.span().byte_range()).with_message( + format!("Reduce rule {} was defined here", name), + ), + ); + labels.push( + Label::secondary(file_id, rule_range) + .with_message("in this line".to_string()), + ); + } else { + let origin_span = grammar + .generated_root_span + .get(name) + .expect("generated_root_span::rule not found"); + let origin_range = origin_span.0.byte_range().start + ..origin_span.1.byte_range().end; + labels.push(Label::primary(file_id, origin_range).with_message( + format!("Reduce rule {} was generated here", name), + )); + } + + for shift_rule in shift_rules.iter() { + let (name, rules, rule) = grammar + .get_rule_by_id(shift_rule.rule) + .expect("Rule not found"); + if !name + .to_string() + .starts_with(rusty_lr_parser::utils::AUTO_GENERATED_RULE_PREFIX) + { + let 
first_shift_token_byte = rules.rule_lines[rule].tokens + [shift_rule.shifted] + .begin_span + .byte_range() + .start; + let (_, rule_end) = rules.rule_lines[rule].span_pair(); + let rule_range = + first_shift_token_byte..rule_end.byte_range().end; + labels.push( + Label::primary(file_id, name.span().byte_range()) + .with_message(format!( + "Shift rule {} was defined here", + name + )), + ); + labels.push( + Label::secondary(file_id, rule_range) + .with_message("in this line".to_string()), + ); + } else { + let origin_span = grammar + .generated_root_span + .get(name) + .expect("generated_root_span::rule not found"); + let origin_range = origin_span.0.byte_range().start + ..origin_span.1.byte_range().end; + labels.push( + Label::secondary(file_id, origin_range).with_message( + format!("Shift rule {} was generated here", name), + ), + ); + } + } + + let reduce_type_origin = grammar + .reduce_types_origin + .get(term) + .expect("reduce_types_origin not found"); + let range = reduce_type_origin.0.byte_range().start + ..reduce_type_origin.1.byte_range().end; + let reduce_type = *grammar + .reduce_types + .get(term) + .expect("reduce_types not found"); + let type_string = match reduce_type { + rusty_lr_core::ReduceType::Left => "%left", + rusty_lr_core::ReduceType::Right => "%right", + }; + labels.push(Label::primary(file_id, range).with_message(format!( + "Reduce type was set as {} here", + type_string + ))); + + let diag = Diagnostic::note().with_message(message).with_labels(labels); + + let writer = StandardStream::stderr(ColorChoice::Auto); + let config = codespan_reporting::term::Config::default(); + term::emit(&mut writer.lock(), &config, &files, &diag) + .expect("Failed to write to stderr"); + } + } + } + } + } + + Ok(output::Output { + user_stream: output_stream, + generated_stream: expanded_stream, + debug_comments, + }) + } +} diff --git a/rusty_lr_buildscript/src/output.rs b/rusty_lr_buildscript/src/output.rs new file mode 100644 index 0000000..c9a7904 --- /dev/null +++ b/rusty_lr_buildscript/src/output.rs @@ -0,0 +1,10 @@ +use proc_macro2::TokenStream; + +pub struct Output { + /// token stream before '%%' + pub user_stream: TokenStream, + /// token stream after '%%' + pub generated_stream: TokenStream, + /// debug comments attatched to the output file + pub debug_comments: String, +} diff --git a/rusty_lr_buildscript/src/split.rs b/rusty_lr_buildscript/src/split.rs new file mode 100644 index 0000000..958fe92 --- /dev/null +++ b/rusty_lr_buildscript/src/split.rs @@ -0,0 +1,30 @@ +use proc_macro2::Spacing; +use proc_macro2::TokenStream; +use proc_macro2::TokenTree; +use quote::TokenStreamExt; + +// split stream by '%%' +pub fn split_stream(token_stream: TokenStream) -> Result<(TokenStream, TokenStream), ()> { + // input stream + let mut token_stream = token_stream.into_iter().peekable(); + + // before '%%' + let mut output_stream = TokenStream::new(); + + while let Some(token) = token_stream.next() { + if let TokenTree::Punct(token) = &token { + if token.as_char() == '%' && token.spacing() == Spacing::Joint { + if let Some(TokenTree::Punct(next)) = token_stream.peek() { + if next.as_char() == '%' && next.spacing() == Spacing::Alone { + token_stream.next(); + let macro_stream: TokenStream = token_stream.collect(); + return Ok((output_stream, macro_stream)); + } + } + } + } + output_stream.append(token); + } + + Err(()) +} diff --git a/rusty_lr_buildscript/src/utils.rs b/rusty_lr_buildscript/src/utils.rs new file mode 100644 index 0000000..87f687c --- /dev/null +++ 
b/rusty_lr_buildscript/src/utils.rs
@@ -0,0 +1,23 @@
+use proc_macro2::Span;
+use proc_macro2::TokenStream;
+
+use std::ops::Range;
+
+pub fn tokenstream_range(stream: TokenStream) -> Range<usize> {
+    if stream.is_empty() {
+        return 0..0;
+    }
+    let mut stream = stream.into_iter();
+    let first = stream.next().unwrap().span().byte_range();
+    let last = if let Some(last) = stream.last() {
+        last.span().byte_range()
+    } else {
+        first.clone()
+    };
+
+    first.start..last.end
+}
+pub fn span_stream_range(span: Span, stream: TokenStream) -> Range<usize> {
+    let stream_range = tokenstream_range(stream);
+    span.byte_range().start..stream_range.end
+}
diff --git a/rusty_lr_core/Cargo.toml b/rusty_lr_core/Cargo.toml
index 0f0d792..baf7373 100644
--- a/rusty_lr_core/Cargo.toml
+++ b/rusty_lr_core/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rusty_lr_core"
-version = "2.2.1"
+version = "2.2.2"
 edition = "2021"
 license = "MIT"
 description = "core library for rusty_lr"
diff --git a/rusty_lr_core/src/lib.rs b/rusty_lr_core/src/lib.rs
index 43a1880..27cd6de 100644
--- a/rusty_lr_core/src/lib.rs
+++ b/rusty_lr_core/src/lib.rs
@@ -9,6 +9,7 @@ pub(crate) mod rule;
 pub(crate) mod state;
 pub(crate) mod token;
 
+/// module for building DFA tables from a CFG
 #[cfg(feature = "builder")]
 pub mod builder;
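
For context, the `builder` module documented in the last hunk is what the new `rusty_lr_buildscript` crate drives internally (see `build_impl` above). A rough sketch of that flow, using only names that appear in this patch — the exact signatures are assumptions, not a documented public API:

```rust
use proc_macro2::{Ident, Span};

// `grammar` is a rusty_lr_parser::grammar::Grammar, as produced by
// Grammar::from_grammar_args() in build_impl() above.
fn dump_tables(grammar: rusty_lr_parser::grammar::Grammar, lalr: bool) {
    // CFG -> DFA-table builder (requires rusty_lr_core's "builder" feature)
    let mut builder = grammar.create_grammar();
    for rule in builder.rules.iter() {
        // the production rules, as printed into the debug comments
        println!("{}", rule);
    }
    let augmented = Ident::new(rusty_lr_parser::utils::AUGMENTED_NAME, Span::call_site());
    let parser = if lalr {
        builder.build_lalr(augmented) // LALR(1) tables
    } else {
        builder.build(augmented) // LR(1) tables
    }
    .expect("grammar building failed");
    // `parser.states` holds the generated DFA states
    println!("{} states", parser.states.len());
}
```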