boseiju/
parser.rs

1mod error;
2mod node;
3mod rule_map;
4mod rules;
5
6pub use error::ParserError;
7pub use node::ParserNode;
8
9use std::sync::Arc;
10
11use rapidhash::HashMapExt;
12use rapidhash::HashSetExt;
13type HashSet<V> = rapidhash::RapidHashSet<V>;
14type HashMap<K, V> = rapidhash::RapidHashMap<K, V>;
15
/// Backpointer attached to an Earley item, recording how one rule token was matched.
///
/// One backpointer is pushed per advanced position, so a fully matched item
/// carries exactly one backpointer per token of its rule.
#[derive(Clone)]
enum EarleyBackpointer<'r> {
    /// The token was scanned directly from the input stream;
    /// the payload is the index of that token in the input.
    Scanned(usize),
    /// The token was produced by a completed sub-item,
    /// kept here so it can be reduced recursively later.
    Complete(Arc<EarleyItem<'r>>),
}
21
/// Earley Item.
///
/// From the Earley algorithm, an Earley item is an object that contains:
/// - A rule to match for. This item is responsible for trying to match that rule
///   on the given input stream, at the given start position.
/// - A start index, this is where the item started to match the tokens with
///   the given rule.
/// - A position index, indicating how many of the rule tokens have been matched already.
///
/// The rule is generally noted `(A -> a b . c, 2)` where the dot represents the position index.
/// In this example, the rule is `A -> a b c`, and we already matched `a b` from the tokens 2 and 3
/// of the input stream, and we are expecting a token `c` from the stream.
///
/// The debug implementation displays the item with this format.
#[derive(Clone)]
struct EarleyItem<'r> {
    /// The grammar rule this item tries to match.
    pub rule: &'r rules::ParserRule,
    /// Index in the input stream where this item started matching.
    pub start_index: usize,
    /// How many rule tokens have been matched so far (the position of the "dot").
    pub position_index: usize,
    /// One backpointer per matched rule token, used to rebuild the parse tree on reduction.
    pub backpointers: Vec<EarleyBackpointer<'r>>,
}
43
44impl<'r> EarleyItem<'r> {
45    /// Construct a new Earley item with no backpointers.
46    fn new(rule: &'r rules::ParserRule, start_index: usize, position_index: usize) -> Self {
47        Self {
48            rule,
49            start_index,
50            position_index,
51            backpointers: Vec::with_capacity(rule.expanded.length.get()),
52        }
53    }
54
55    /// This function gets the token that this rule is awaiting for progress.
56    ///
57    /// For this item of the form (A -> a . B b, i) get the B token id.
58    fn expecting_token(&self) -> Option<usize> {
59        self.rule.expanded.get(self.position_index).cloned()
60    }
61
62    /// Checks wheteher this item is of the form (A -> a . , i).
63    /// In other terms, whether this rule fully matched the input or not.
64    fn rule_complete(&self) -> bool {
65        self.rule.expanded.length.get() == self.position_index
66    }
67
68    /// Use the item rule to reduce the parser nodes.
69    ///
70    /// If the item contains any backpointers, they will be used to recursively
71    /// call the rules required to merge everything together.
72    fn reduce(&self, nodes: &[ParserNode]) -> Result<ParserNode, ParserError> {
73        let rule_token_count = self.rule.expanded.length.get();
74
75        let mut tokens_for_reduction = Vec::with_capacity(rule_token_count);
76        for backpointer in self.backpointers.iter().take(rule_token_count) {
77            match backpointer {
78                EarleyBackpointer::Scanned(token_index) => tokens_for_reduction.push(nodes[*token_index].clone()),
79                EarleyBackpointer::Complete(backpointer) => tokens_for_reduction.push(backpointer.reduce(nodes)?),
80            }
81        }
82
83        match (self.rule.reduction)(&tokens_for_reduction) {
84            Ok(node) => Ok(node),
85            Err(merge_error) => Err(ParserError::FailedToApplyRule {
86                merge_error,
87                for_rule: self.rule.creation_loc.clone(),
88            }),
89        }
90    }
91}
92
93impl<'r> std::fmt::Debug for EarleyItem<'r> {
94    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
95        use idris::Idris;
96        write!(f, "({} -> ", ParserNode::name_from_id(self.rule.merged))?;
97        for i in 0..self.position_index {
98            write!(f, "{} ", ParserNode::name_from_id(self.rule.expanded[i]))?;
99        }
100        write!(f, ".")?;
101        for i in self.position_index..self.rule.expanded.length.get() {
102            write!(f, " {}", ParserNode::name_from_id(self.rule.expanded[i]))?;
103        }
104        write!(f, ", {})", self.start_index)?;
105        Ok(())
106    }
107}
108
109impl<'r> PartialEq for EarleyItem<'r> {
110    fn eq(&self, other: &Self) -> bool {
111        self.rule == other.rule && self.start_index == other.start_index && self.position_index == other.position_index
112    }
113}
114
/* NOTE(review): marker impl assumes equality on `ParserRule` is total (reflexive) — confirm on its definition */
impl<'r> Eq for EarleyItem<'r> {}
116
117impl<'r> std::hash::Hash for EarleyItem<'r> {
118    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
119        self.rule.hash(state);
120        self.start_index.hash(state);
121        self.position_index.hash(state);
122    }
123}
124
/// Earley row.
///
/// An Earley row is a set of [`EarleyItem`] that are stored at a single row
/// in the Earley algorithm. That is, any T\[i\] is an Earley row.
///
/// The core idea of the algorithm is that in the Earley table at index j,
/// the Earley row j contains all the rules that are potential matches for the
/// incoming token stream.
#[derive(Clone)]
struct EarleyRow<'r> {
    /// Storage for all items that are completed.
    /// That is, there is no awaiting token.
    /// They are stored separately for quick access in the algorithm.
    pub completed_items: HashSet<Arc<EarleyItem<'r>>>,
    /// Storage for all items awaiting a node. They are stored in a
    /// HashMap where the awaited node id is the key, for easy access.
    pub uncompleted_items: HashMap<usize, HashSet<Arc<EarleyItem<'r>>>>,
}
143
144impl<'r> EarleyRow<'r> {
145    /// Creates a new, empty row.
146    fn new() -> Self {
147        Self {
148            completed_items: HashSet::new(),
149            uncompleted_items: HashMap::new(),
150        }
151    }
152
153    /// Create the start row of the Earley Table for an algorithm that is targetting
154    /// the Ability Tree node.
155    fn start_row(rules: &'static rule_map::RuleMap) -> EarleyRow<'static> {
156        use crate::utils::dummy;
157        use idris::Idris;
158
159        let mut start_row = EarleyRow::new();
160
161        let target_node = ParserNode::AbilityTree { tree: dummy() };
162        let node_id = target_node.id();
163
164        let mut queue: Vec<Arc<_>> = Vec::new();
165
166        if let Some(rules) = rules.get_rules_for_token(node_id) {
167            for rule in rules {
168                let item = Arc::new(EarleyItem::new(rule, 0, 0));
169                start_row.insert(item.clone());
170                queue.push(item.clone());
171            }
172        }
173
174        while let Some(item) = queue.pop() {
175            if let Some(new_items) = predictor_step(&rules, 0, &item) {
176                for new_item in new_items {
177                    let item = Arc::new(new_item.clone());
178                    if start_row.insert(item.clone()) {
179                        queue.push(item.clone());
180                    }
181                }
182            }
183        }
184
185        start_row
186    }
187
188    fn insert(&mut self, item: Arc<EarleyItem<'r>>) -> bool {
189        match item.expecting_token() {
190            None => {
191                /* if no tokens are expected, the rule is complete */
192                self.completed_items.insert(item)
193            }
194            Some(expecting) => {
195                /* Otherwise, we are expecting a token */
196                let row = self.uncompleted_items.entry(expecting).or_default();
197                row.insert(item)
198            }
199        }
200    }
201
202    fn is_empty(&self) -> bool {
203        self.completed_items.is_empty() && self.uncompleted_items.is_empty()
204    }
205}
206
207impl<'r> std::fmt::Debug for EarleyRow<'r> {
208    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
209        if self.is_empty() {
210            write!(f, "[]")?;
211        } else {
212            writeln!(f, "[")?;
213            for item in self.completed_items.iter() {
214                writeln!(f, "  {item:?}")?;
215            }
216            for items in self.uncompleted_items.values() {
217                for item in items.iter() {
218                    writeln!(f, "  {item:?}")?;
219                }
220            }
221            write!(f, "]")?;
222        }
223        Ok(())
224    }
225}
226
/// Earley Table.
///
/// The Earley table is the object constructed with the Earley parsing algorithm.
/// It contains as many rows as there are tokens (plus one for the start row) and
/// keeps track of the rules that can match the provided input stream.
#[derive(Clone)]
struct EarleyTable<'r> {
    /// The rows, indexed by input position: `table[j]` holds the items after `j` tokens.
    pub table: Vec<EarleyRow<'r>>,
}
236
237impl<'r> EarleyTable<'r> {
238    pub fn new(node_count: usize, start_row: EarleyRow<'r>) -> Self {
239        let mut table = Vec::with_capacity(node_count + 1);
240        table.push(start_row);
241
242        Self { table }
243    }
244}
245
246impl<'r> std::fmt::Debug for EarleyTable<'r> {
247    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
248        for (i, row) in self.table.iter().enumerate() {
249            writeln!(f, "T[{i}] = {row:?}")?;
250        }
251        Ok(())
252    }
253}
254
255impl<'r> std::ops::Deref for EarleyTable<'r> {
256    type Target = [EarleyRow<'r>];
257    fn deref(&self) -> &Self::Target {
258        self.table.as_slice()
259    }
260}
261
262/// Actual implementation of the Earley algorithm.
263fn parse_impl(tokens: &[crate::lexer::tokens::Token]) -> Result<crate::AbilityTree, error::ParserError> {
264    use crate::utils::dummy;
265    use idris::Idris;
266
267    /* Rule map, all our parsing rules in a single struct */
268    lazy_static::lazy_static!(
269        /// The rule map contains all the rules to parse the MTG cards.
270        static ref rules: rule_map::RuleMap = rule_map::RuleMap::default().expect("Default Rule Map shall be OK");
271
272        /// A first row for the earley parsing algorithm with the target of parsing a full ability tree.
273        ///
274        /// Since the row depends only on the rule map, we can create a static instance of
275        /// it and clone it whenever we start a new parsing, instead of rebuilding it each time.
276        ///
277        /// Fixme: I think we can construct it each time it's fine, and it gives us control over the target
278        static ref earley_start_row: EarleyRow<'static> = EarleyRow::start_row(&rules);
279    );
280
281    /* Implementation of the Earley parser */
282    let target_node_id = ParserNode::AbilityTree { tree: dummy() }.id();
283    let nodes: Vec<ParserNode> = tokens.iter().cloned().map(ParserNode::from).collect();
284
285    /* First step: init the table row 0 with all rules that can create the final token */
286    let node_count = nodes.len();
287    let mut earley_table = EarleyTable::new(node_count, earley_start_row.clone());
288
289    for (node_index, node) in nodes.iter().enumerate() {
290        use idris::Idris;
291        let node_id = node.id();
292        let j = node_index + 1;
293
294        /* Create the next Earley table entry, T[j]  */
295        let mut next_table_row = EarleyRow::new();
296
297        /* The queue allows to process each item once, seeing what other items are added */
298        let mut queue = Vec::new();
299
300        /* Scanner step */
301        /* for all items in the previous row, add them to this row if they match the current token */
302        if let Some(new_items) = scanner_step(&earley_table.table[node_index], node_id, node_index) {
303            for new_item in new_items {
304                let new_item = Arc::new(new_item);
305                queue.push(new_item.clone());
306                next_table_row.insert(new_item.clone());
307            }
308        }
309
310        /* Saturate Predictor + Completor steps */
311        while let Some(item) = queue.pop() {
312            /* Predictor step */
313            if let Some(new_items) = predictor_step(&rules, j, &item) {
314                for new_item in new_items {
315                    /* Add the new item in the queue for eploring only if it is unknown (not in the row) */
316                    if next_table_row.insert(Arc::new(new_item.clone())) {
317                        queue.push(Arc::new(new_item.clone()));
318                    }
319                }
320            }
321            /* Completor step */
322            if let Some(new_items) = completor_step(&earley_table, item) {
323                for new_item in new_items {
324                    /* Add the new item in the queue for eploring only if it is unknown (not in the row) */
325                    if next_table_row.insert(Arc::new(new_item.clone())) {
326                        queue.push(Arc::new(new_item.clone()));
327                    }
328                }
329            }
330        }
331
332        earley_table.table.push(next_table_row);
333    }
334
335    /* Look for an item of kind (S -> a . , 0) for parser completion */
336    let completed_items = earley_table.table[node_count]
337        .completed_items
338        .iter()
339        .filter(|item| item.rule.merged == target_node_id)
340        .collect::<Vec<_>>();
341
342    match completed_items.as_slice() {
343        /* No item completed, create a parse error from the earley table */
344        &[] => Err(error::ParserError::from_earley_table(&earley_table, tokens)),
345        /* A single item is complete: we have a condidate for merging */
346        &[complete_item] => match complete_item.reduce(&nodes)? {
347            ParserNode::AbilityTree { tree } => return Ok(tree),
348            _ => unreachable!(),
349        },
350        _ => Err(ParserError::AmbiguousCandidates),
351    }
352}
353
354/// Scanner step of the Earley Algorithm.
355///
356/// The goal of this step is to consume a new token from the input token stream,
357/// and feed it to all the rules that are current matches to advance those rules.
358///
359/// All the rules in the row T\[j-1\] that were expecting the token are bumped
360/// into T\[j\] with their position index advanced.
361///
362/// The awaited non terminals are handled in the predictor step.
363fn scanner_step<'r>(prev_row: &EarleyRow<'r>, token: usize, token_index: usize) -> Option<impl Iterator<Item = EarleyItem<'r>>> {
364    Some(prev_row.uncompleted_items.get(&token)?.iter().map(move |prev_item| {
365        use std::ops::Deref;
366
367        let mut new_item: EarleyItem = prev_item.deref().clone();
368        new_item.position_index += 1;
369        new_item.backpointers.push(EarleyBackpointer::Scanned(token_index));
370
371        new_item
372    }))
373}
374
375/// Predictor step of the Earley Algorithm.
376///
377/// The goal of this step is to satisfy all the rules that are currently awaiting a non terminal token.
378/// Since the scanner step can only feed terminals, for each awaited non terminal in a rule,
379/// we add that rule to the current row, with the start position set at the current index.
380///
381/// Then, the rule completion logic is handled in the completor step.
382///
383/// This function returns whether new items were added to the next row.
384fn predictor_step<'r>(
385    rules: &'r rule_map::RuleMap,
386    j: usize,
387    item: &EarleyItem<'r>,
388) -> Option<impl Iterator<Item = EarleyItem<'r>>> {
389    let next_token = item.expecting_token()?;
390    Some(
391        rules
392            .get_rules_for_token(next_token)?
393            .map(move |rule| EarleyItem::new(rule, j, 0)),
394    )
395}
396
397/// Completor step of the Earley Algorithm.
398///
399/// If we complete a rule in the current row, we need to look back to all the rows
400/// that were awaiting this rule to complete to advance it.
401///
402/// For a given rule (A -> a . , i) that is compleated, we can look in T[\i\] (since that
403/// rule started at i) for any rule that was awaiting for an A non terminal, and advance
404/// it in the current row.
405///
406/// This function returns whether new items were added to the next row.
407fn completor_step<'r>(
408    earley_table: &EarleyTable<'r>,
409    completed_item: Arc<EarleyItem<'r>>,
410) -> Option<impl Iterator<Item = EarleyItem<'r>>> {
411    if completed_item.rule_complete() {
412        /* If the itme is completed, check in T[i] for all rules awaiting this item */
413        let awaiting_row = &earley_table.table[completed_item.start_index];
414        let awaiting_items = awaiting_row.uncompleted_items.get(&completed_item.rule.merged)?;
415        Some(awaiting_items.iter().map(move |awaiting_item| {
416            /* And we can bump these rules up */
417            use std::ops::Deref;
418
419            let mut new_item: EarleyItem = awaiting_item.deref().clone();
420            new_item.position_index += 1;
421            /* Set the backpointer, since the current row item validated this token */
422            new_item
423                .backpointers
424                .push(EarleyBackpointer::Complete(completed_item.clone()));
425
426            new_item
427        }))
428    } else {
429        None
430    }
431}
432
/// Entry point of the parsing algorithm.
///
/// Attempts to parse a sequence of nodes into an ability tree, using the Earley parsing algorithm.
///
/// The algorithm reference can be found here: <https://en.wikipedia.org/wiki/Earley_parser>
/// The algorithm used for the implementation was: <https://fr.wikipedia.org/wiki/Analyse_Earley> (cocorico)
///
/// # Errors
///
/// Returns a [`ParserError`] when no rule combination matches the token stream,
/// when a rule reduction fails, or when several ambiguous candidates complete the parse.
pub fn parse(tokens: &[crate::lexer::tokens::Token]) -> Result<crate::AbilityTree, error::ParserError> {
    parse_impl(tokens)
}
441}