// boseiju/lexer.rs

1mod error;
2mod span;
3pub mod tokens;
4
5pub use error::LexerError;
6pub use span::Span;
7pub use tokens::IntoToken;
8
9/// Preprocess a card oracle text to properly lex it.
10pub fn preprocess(card_name: &str, oracle_text: &str) -> String {
11    let card_name = card_name.to_lowercase();
12    let result = oracle_text.to_lowercase();
13
14    /* replace all raw unicode char points by they values */
15    lazy_static::lazy_static!(
16        static ref unicode_regex: regex::Regex = regex::Regex::new("\\\\u(\\d{4})")
17            .expect("Failed to compile unicode character point regex");
18    );
19    let replacement = |cap: &regex::Captures| -> String {
20        let (_, [point]) = cap.extract();
21        let point = u32::from_str_radix(point, 16).expect("Regex matched a non valid u32!");
22        let ch = char::from_u32(point).expect("Regex matched a non valid unicode point!");
23        ch.to_string()
24    };
25    let result = unicode_regex.replace_all(&result, &replacement);
26
27    /* Use lowercase for parsing */
28    let result = result.to_ascii_lowercase();
29
30    /* Actual text modifications preprocessing */
31    let result = remove_comments(&result);
32    let result = replace_name(&card_name, &result);
33    let result = result.replace("\\n", "\n");
34
35    result
36}
37
38/// Remove all text within parenthesis in the given source, and returns the newly built string.
39fn remove_comments(input: &str) -> String {
40    let mut chars = input.chars();
41    let mut result = String::with_capacity(input.len());
42
43    while let Some(char) = chars.next() {
44        match char {
45            '(' => remove_parens(&mut chars),
46            c => result.push(c),
47        }
48    }
49
50    result
51}
52
/// Pop chars from the given iterator until a closing parens is popped.
/// If an opening parens is popped, will call itself recursively to skip
/// the nested group.
///
/// If the iterator runs out before a closing parens is found (unbalanced
/// input), the function simply returns — the previous catch-all arm kept
/// matching `None` and looped forever on unclosed parentheses.
fn remove_parens<I: Iterator<Item = char>>(chars: &mut I) {
    while let Some(char) = chars.next() {
        match char {
            ')' => return,
            '(' => remove_parens(chars),
            _ => { /* keep popping */ }
        }
    }
}
64
/// Replace occurrences of the card's own name in its oracle text with `~`.
///
/// Both the full name and the short name (the part before a comma epithet,
/// e.g. "jace" for "Jace, the Mind Sculptor") are replaced, since oracle
/// text can refer to the card either way.
fn replace_name(card_name: &str, lowercase_oracle_text: &str) -> String {
    let card_name_lowercase = card_name.to_ascii_lowercase();
    let result = lowercase_oracle_text.replace(&card_name_lowercase, "~");

    /* `str::split` always yields at least one item, so the old `match` with a
     * `None` arm was unreachable dead code. When the name has no epithet the
     * short name equals the full name and this second pass is a no-op. */
    let short_name = card_name_lowercase
        .split(',')
        .next()
        .unwrap_or(&card_name_lowercase);
    result.replace(short_name, "~")
}
78
79/// Create a vec of Terminals from a string. Can fail, and will return an error if it does.
80pub fn lex(input: &str) -> Result<Vec<tokens::Token>, error::LexerError> {
81    lazy_static::lazy_static!(
82        static ref raw_token_regex: regex::Regex = {
83            /* List of non words token we also want to match */
84            const MATCHABLE_NON_WORDS: &[&'static str] = &[
85                "\\.", ",", "'", "{", "}", "~", "\\/", ":", "+", "\\-", "—", "•", "\n",
86            ];
87            let matchable_non_words: String = MATCHABLE_NON_WORDS.iter().cloned().collect();
88            let raw_token_pattern = format!("(\\b\\w+\\b)|([{}])", matchable_non_words);
89            regex::Regex::new(&raw_token_pattern).expect("Failed to compile regex!")
90        };
91    );
92
93    let mut raw_tokens: std::collections::VecDeque<_> = raw_token_regex.find_iter(input).collect();
94
95    let mut result = Vec::new();
96
97    'outer: while !raw_tokens.is_empty() {
98        /* Attempt to parse as much tokens as possible, reducing by one each time */
99        for token_count in (0..raw_tokens.len()).rev() {
100            let start = raw_tokens[0].start();
101            let end = raw_tokens[token_count].end();
102            let span = span::Span {
103                start,
104                length: end - start,
105                text: &input[start..end],
106            };
107            /* Fix me: this is byte index, not character index ? may cause issues with the webdemo */
108            if let Some(token) = tokens::Token::try_from_span(span) {
109                raw_tokens.drain(0..token_count + 1);
110                result.push(token);
111                continue 'outer;
112            }
113        }
114        /* Failed to parse at all, stop the loop */
115        break;
116    }
117
118    if raw_tokens.is_empty() {
119        Ok(result)
120    } else {
121        let start = raw_tokens[0].start();
122        let end = raw_tokens[raw_tokens.len() - 1].end();
123        Err(error::LexerError::NoTokenMatch {
124            start: start,
125            end: end,
126            tokens: input[start..end].to_string(),
127        })
128    }
129}