diff --git a/crates/codebook/src/queries/c.scm b/crates/codebook/src/queries/c.scm index 9780926d..578065f6 100644 --- a/crates/codebook/src/queries/c.scm +++ b/crates/codebook/src/queries/c.scm @@ -7,9 +7,11 @@ (type_definition declarator: (type_identifier) @identifier.type) (struct_specifier - name: (type_identifier) @identifier.type) + name: (type_identifier) @identifier.type + body: _) (union_specifier - name: (type_identifier) @identifier.type) + name: (type_identifier) @identifier.type + body: _) (field_declaration declarator: (field_identifier) @identifier.field) (pointer_declarator @@ -17,7 +19,8 @@ (array_declarator declarator: (field_identifier) @identifier.field) (enum_specifier - name: (type_identifier) @identifier.type) + name: (type_identifier) @identifier.type + body: _) (enumerator name: (identifier) @identifier.constant) (init_declarator diff --git a/crates/codebook/tests/examples/example.c.in b/crates/codebook/tests/examples/example.c.in new file mode 100644 index 00000000..57d08d26 --- /dev/null +++ b/crates/codebook/tests/examples/example.c.in @@ -0,0 +1,44 @@ +int @@calculatr@@(int @@numbr@@1, int @@numbr@@2, char @@operashun@@) { + // This is an @@exampl@@ function that @@performz@@ @@calculashuns@@ + int @@resalt@@ = 0; + int @@misspellled@@; + return resalt + misspellled; +} + +#define @@MACROCONST@@ 3 +#define @@MACROFUNC@@(@@macroparam@@) macroparam + 1 + +typedef int @@Mispelll@@; + +union @@myunion@@ { int int_val; }; + +enum @@Colrs@@ { @@Grean@@ }; + +struct User@@Accaunt@@ { + char* @@usrrnamee@@; + int @@ballancee@@; + float @@intrest@@_rate; +}; + +void foo() { + int @@arrayy@@[3]; + int* @@pointerr@@; + int* @@pointerrarray@@[3]; + enum Colrs @@colorr@@; + union myunion @@unionn@@; + struct UserAccaunt @@userr@@; +} + +void bar() { + int @@arrayyy@@[3] = {}; + int* @@pointerrr@@ = NULL; + int* @@pointerrarrayy@@[3] = {}; + enum Colrs @@colorrr@@ = Grean; + union myunion @@unionnn@@ = {}; + struct UserAccaunt @@userrr@@ = {}; + + 
const char* str1 = "@@aaaa@@ @@bbbb@@"; + str1 = "@@cccc@@" "valid string" "@@dddd@@"; + printf("I'm a multi line @@stringg@@\n" + "@@withh@@\n@@yyy@@"); +} diff --git a/crates/codebook/tests/languages/assert_helpers.rs b/crates/codebook/tests/languages/assert_helpers.rs new file mode 100644 index 00000000..e7623540 --- /dev/null +++ b/crates/codebook/tests/languages/assert_helpers.rs @@ -0,0 +1,167 @@ +use codebook::parser::{TextRange, WordLocation}; +use std::collections::HashMap; + +/// Output of `get_marked_misspellings`: the cleaned text and the words marked in it. +pub struct ExpectedMisspellingsResult { + /// The full body of text to be used in spell checking, with + /// the start/end delimiters removed. + pub content: String, + /// List of misspellings, sorted lexicographically by word. + pub misspellings: Vec<WordLocation>, +} + +/// Finds all words in the supplied `raw_content` between the specified +/// start/end delimiters. The words inside the delimiters represent words +/// that should be considered misspelled. +/// +/// A start delimiter with no matching end delimiter is kept verbatim in the +/// returned content and ends the scan. +pub fn get_marked_misspellings( + raw_content: &str, + start_delimiter: &str, + end_delimiter: &str, +) -> ExpectedMisspellingsResult { + let mut cursor = 0; + let mut content = String::new(); + let mut misspelled_words: HashMap<&str, WordLocation> = HashMap::new(); + + // Find first instance of the start delimiter starting at the cursor. + while let Some(start_offset) = raw_content[cursor..].find(start_delimiter) { + let start_index = start_offset + cursor + start_delimiter.len(); + + // Next, look for a matching end delimiter, exiting if there isn't one. + let Some(end_offset) = raw_content[start_index..].find(end_delimiter) else { + break; + }; + + let end_index = start_index + end_offset; + let word = &raw_content[start_index..end_index]; + + // Compute the start and end bytes in the content string (not + // the raw content, which has the extra delimiters).
+ content += &raw_content[cursor..start_index - start_delimiter.len()]; + let start_byte = content.len(); + content += word; + let end_byte = content.len(); + + let range = TextRange { + start_byte, + end_byte, + }; + + if let Some(word_info) = misspelled_words.get_mut(word) { + word_info.locations.push(range); + } else { + misspelled_words.insert(word, WordLocation::new(word.to_string(), vec![range])); + } + + cursor = end_index + end_delimiter.len(); + } + + // Add the rest of the raw content after the final end delimiter. Using + // `cursor` keeps the entire input when no delimited word was found. + content += &raw_content[cursor..]; + + let mut misspellings: Vec<WordLocation> = misspelled_words.into_values().collect(); + misspellings.sort_by(|w1, w2| w1.word.cmp(&w2.word)); + + ExpectedMisspellingsResult { + content, + misspellings, + } +} + +/// Checks that two sorted sequences of WordLocations are equal, panicking with +/// helpful debug information on failure. +#[macro_export] +macro_rules! assert_word_locations_match { + ($actual:expr, $expected:expr) => {{ + let actual_val = $actual; + let expected_val = $expected; + let actual_words = actual_val.iter().map(|w| w.word.as_str()); + let expected_words = expected_val.iter().map(|w| w.word.as_str()); + + // Warn the user if the lists of words are different. + if actual_val.len() != expected_val.len() { + panic!( + "word list mismatch: actual.len() = {}, expected.len() = {}\n\nactual words = {:?}\n\nexpected words = {:?}", + actual_val.len(), + expected_val.len(), + actual_words.collect::<Vec<_>>(), + expected_words.collect::<Vec<_>>() + ); + } + + // Otherwise, go word-by-word and fail at the first error. + for (i, (a, e)) in actual_val.iter().zip(expected_val.iter()).enumerate() { + if a.word != e.word { + panic!( + "word mismatch at index {}:\n actual = {:#?}\n expected = {:#?}\n\n", + i, a, e + ); + } + + // Locations are not necessarily sorted by start byte, so + // sort them before comparison.
+ let mut a_loc = a.locations.clone(); + let mut e_loc = e.locations.clone(); + a_loc.sort_by(|l1, l2| l1.start_byte.cmp(&l2.start_byte)); + e_loc.sort_by(|l1, l2| l1.start_byte.cmp(&l2.start_byte)); + + if a_loc != e_loc { + panic!( + "location mismatch for \"{}\" at index {}:\n actual = {:#?}\n expected = {:#?}", + a.word, i, a_loc, e_loc + ); + } + } + }}; +} + +pub(crate) use assert_word_locations_match; + +#[test] +fn test_get_expected_misspellings_simple() { + let result = get_marked_misspellings(" ^a$ ^a$ a ^A$ ^^b$ ", "^", "$"); + + assert_eq!(result.content, " a a a A ^b "); + assert_eq!( + result.misspellings, + vec![ + WordLocation::new( + "A".to_string(), + vec![TextRange { + start_byte: 7, + end_byte: 8 + },] + ), + WordLocation::new( + "^b".to_string(), + vec![TextRange { + start_byte: 9, + end_byte: 11 + }] + ), + WordLocation::new( + "a".to_string(), + vec![ + TextRange { + start_byte: 1, + end_byte: 2 + }, + TextRange { + start_byte: 3, + end_byte: 4 + }, + ] + ), + ] + ); +} + +#[test] +fn test_get_marked_misspellings_without_delimiters() { + let result = get_marked_misspellings("plain text", "@@", "@@"); + assert_eq!(result.content, "plain text"); + assert!(result.misspellings.is_empty()); +} diff --git a/crates/codebook/tests/languages/main.rs b/crates/codebook/tests/languages/main.rs index f3a1daed..4d9c9817 100644 --- a/crates/codebook/tests/languages/main.rs +++ b/crates/codebook/tests/languages/main.rs @@ -1,3 +1,4 @@ +mod assert_helpers; mod utils; mod test_c; diff --git a/crates/codebook/tests/languages/test_c.rs b/crates/codebook/tests/languages/test_c.rs index d231e0e1..421113ab 100644 --- a/crates/codebook/tests/languages/test_c.rs +++ b/crates/codebook/tests/languages/test_c.rs @@ -1,477 +1,18 @@ -use codebook::{ - parser::{TextRange, WordLocation}, - queries::LanguageType, -}; - -#[test] -fn test_c_simple() { - super::utils::init_logging(); - let processor = super::utils::get_processor(); - let sample_text = r#" - int calculatr(int numbr1, int numbr2, char operashun) { - // This is an exampl function that performz calculashuns - int resalt = 0; - int misspellled; - return resalt + misspellled; - } - "#; - let expected = vec![ - "calculashuns", -
"calculatr", - "exampl", - "misspellled", - "numbr", - "operashun", - "performz", - "resalt", - ]; - let binding = processor - .spell_check(sample_text, Some(LanguageType::C), None) - .to_vec(); - let mut misspelled = binding - .iter() - .map(|r| r.word.as_str()) - .collect::>(); - misspelled.sort(); - println!("Misspelled words: {misspelled:?}"); - assert_eq!(misspelled, expected); -} - -#[test] -fn test_c_comment_location() { - super::utils::init_logging(); - let sample_c = r#" - // Structur definition with misspellings - "#; - let expected = vec![WordLocation::new( - "Structur".to_string(), - vec![TextRange { - start_byte: 12, - end_byte: 20, - }], - )]; - let processor = super::utils::get_processor(); - let misspelled = processor - .spell_check(sample_c, Some(LanguageType::C), None) - .to_vec(); - println!("Misspelled words: {misspelled:?}"); - assert_eq!(misspelled, expected); - assert!(misspelled[0].locations.len() == 1); -} - -#[test] -fn test_c_struct() { - super::utils::init_logging(); - let sample_c = r#" - struct UserAccaunt { - char* usrrnamee; - int ballancee; - float intrest_rate; - }; - "#; - let expected = [ - WordLocation::new( - "Accaunt".to_string(), - vec![TextRange { - start_byte: 20, - end_byte: 27, - }], - ), - WordLocation::new( - "usrrnamee".to_string(), - vec![TextRange { - start_byte: 48, - end_byte: 57, - }], - ), - WordLocation::new( - "ballancee".to_string(), - vec![TextRange { - start_byte: 75, - end_byte: 84, - }], - ), - WordLocation::new( - "intrest".to_string(), - vec![TextRange { - start_byte: 104, - end_byte: 111, - }], - ), - ]; - let processor = super::utils::get_processor(); - let misspelled = processor - .spell_check(sample_c, Some(LanguageType::C), None) - .to_vec(); - println!("Misspelled words: {misspelled:?}"); - for expect in expected.iter() { - println!("Expecting {}", expect.word); - let result = misspelled.iter().find(|r| r.word == expect.word).unwrap(); - assert_eq!(result.word, expect.word); - 
assert_eq!(result.locations, expect.locations); - } -} - -#[test] -fn test_c_macros() { - super::utils::init_logging(); - let sample_text = r#" - #define MACROCONST 3 - #define MACROFUNC(macroparam) macroparam + 1 - "#; - // Needs to be lexicographically sorted - let expected = [ - WordLocation::new( - "MACROCONST".to_string(), - vec![TextRange { - start_byte: 17, - end_byte: 27, - }], - ), - WordLocation::new( - "MACROFUNC".to_string(), - vec![TextRange { - start_byte: 46, - end_byte: 55, - }], - ), - WordLocation::new( - "macroparam".to_string(), - vec![TextRange { - start_byte: 56, - end_byte: 66, - }], - ), - ]; - let processor = super::utils::get_processor(); - let mut misspelled = processor.spell_check(sample_text, Some(LanguageType::C), None); - misspelled.sort_by(|loc1, loc2| loc1.word.cmp(&loc2.word)); - assert_eq!(misspelled, expected); -} - -#[test] -fn test_c_unions() { - super::utils::init_logging(); - let sample_text = r#"union myunion { int int_val; };"#; - - // Needs to be lexicographically sorted - let expected = [WordLocation::new( - "myunion".to_string(), - vec![TextRange { - start_byte: 6, - end_byte: 13, - }], - )]; - let processor = super::utils::get_processor(); - let mut misspelled = processor.spell_check(sample_text, Some(LanguageType::C), None); - misspelled.sort_by(|loc1, loc2| loc1.word.cmp(&loc2.word)); - assert_eq!(misspelled, expected); -} - -#[test] -fn test_c_variable_declarations() { - super::utils::init_logging(); - let sample_text = r#" - int arrayy[3]; - int* pointerr; - int* pointerrarray[3]; - enum Role rolee; - union Union unionn; - struct User userr;"#; - - // Needs to be lexicographically sorted - let expected = [ - WordLocation::new( - "arrayy".to_string(), - vec![TextRange { - start_byte: 13, - end_byte: 19, - }], - ), - WordLocation::new( - "pointerr".to_string(), - vec![TextRange { - start_byte: 37, - end_byte: 45, - }], - ), - WordLocation::new( - "pointerrarray".to_string(), - vec![TextRange { - start_byte: 60, - 
end_byte: 73, - }], - ), - WordLocation::new( - "rolee".to_string(), - vec![TextRange { - start_byte: 96, - end_byte: 101, - }], - ), - WordLocation::new( - "unionn".to_string(), - vec![TextRange { - start_byte: 123, - end_byte: 129, - }], - ), - WordLocation::new( - "userr".to_string(), - vec![TextRange { - start_byte: 151, - end_byte: 156, - }], - ), - ]; - let processor = super::utils::get_processor(); - let mut misspelled = processor.spell_check(sample_text, Some(LanguageType::C), None); - misspelled.sort_by(|loc1, loc2| loc1.word.cmp(&loc2.word)); - assert_eq!(misspelled, expected); -} - -#[test] -fn test_c_variable_initializers() { - super::utils::init_logging(); - // Note: variables with initializers have slightly different syntax tree - // representations, so it useful to test them along with plain declarations. - let sample_text = r#" - int arrayy[3] = {}; - int* pointerr = NULL; - int* pointerrarray[3] = {}; - enum Role rolee = ROLE1; - union Union unionn = 10; - struct User userr = {};"#; - - // Needs to be lexicographically sorted - let expected = [ - WordLocation::new( - "arrayy".to_string(), - vec![TextRange { - start_byte: 13, - end_byte: 19, - }], - ), - WordLocation::new( - "pointerr".to_string(), - vec![TextRange { - start_byte: 42, - end_byte: 50, - }], - ), - WordLocation::new( - "pointerrarray".to_string(), - vec![TextRange { - start_byte: 72, - end_byte: 85, - }], - ), - WordLocation::new( - "rolee".to_string(), - vec![TextRange { - start_byte: 113, - end_byte: 118, - }], - ), - WordLocation::new( - "unionn".to_string(), - vec![TextRange { - start_byte: 148, - end_byte: 154, - }], - ), - WordLocation::new( - "userr".to_string(), - vec![TextRange { - start_byte: 181, - end_byte: 186, - }], - ), - ]; - let processor = super::utils::get_processor(); - let mut misspelled = processor.spell_check(sample_text, Some(LanguageType::C), None); - misspelled.sort_by(|loc1, loc2| loc1.word.cmp(&loc2.word)); - assert_eq!(misspelled, expected); -} - -#[test] 
-fn test_c_field_declarations() { - super::utils::init_logging(); - let sample_text = r#" - struct MyStruct { - int arrayy[3]; - int* pointerr; - int* pointerrarray[3]; - enum Role rolee; - union Union unionn; - struct User userr; - }"#; - - // Needs to be lexicographically sorted - let expected = [ - WordLocation::new( - "arrayy".to_string(), - vec![TextRange { - start_byte: 43, - end_byte: 49, - }], - ), - WordLocation::new( - "pointerr".to_string(), - vec![TextRange { - start_byte: 71, - end_byte: 79, - }], - ), - WordLocation::new( - "pointerrarray".to_string(), - vec![TextRange { - start_byte: 98, - end_byte: 111, - }], - ), - WordLocation::new( - "rolee".to_string(), - vec![TextRange { - start_byte: 138, - end_byte: 143, - }], - ), - WordLocation::new( - "unionn".to_string(), - vec![TextRange { - start_byte: 169, - end_byte: 175, - }], - ), - WordLocation::new( - "userr".to_string(), - vec![TextRange { - start_byte: 201, - end_byte: 206, - }], - ), - ]; - let processor = super::utils::get_processor(); - let mut misspelled = processor.spell_check(sample_text, Some(LanguageType::C), None); - misspelled.sort_by(|loc1, loc2| loc1.word.cmp(&loc2.word)); - assert_eq!(misspelled, expected); -} - -#[test] -fn test_c_strings() { - super::utils::init_logging(); - - // Note: we do not do spell checking across string concatenations, - // just individual strings. 
- let sample_text = r#" - char* str1 = "aaaa bbbb"; - str1 = "cccc" "valid string" "dddd"; - printf("I'm a multiline stringg\n" - "withh\nyyy"); - "#; +use codebook::queries::LanguageType; - // Needs to be lexicographically sorted - let expected = [ - WordLocation::new( - "aaaa".to_string(), - vec![TextRange { - start_byte: 23, - end_byte: 27, - }], - ), - WordLocation::new( - "bbbb".to_string(), - vec![TextRange { - start_byte: 28, - end_byte: 32, - }], - ), - WordLocation::new( - "cccc".to_string(), - vec![TextRange { - start_byte: 51, - end_byte: 55, - }], - ), - WordLocation::new( - "dddd".to_string(), - vec![TextRange { - start_byte: 73, - end_byte: 77, - }], - ), - WordLocation::new( - "stringg".to_string(), - vec![TextRange { - start_byte: 112, - end_byte: 119, - }], - ), - WordLocation::new( - "withh".to_string(), - vec![TextRange { - start_byte: 139, - end_byte: 144, - }], - ), - WordLocation::new( - "yyy".to_string(), - vec![TextRange { - start_byte: 146, - end_byte: 149, - }], - ), - ]; - let processor = super::utils::get_processor(); - let mut misspelled = processor.spell_check(sample_text, Some(LanguageType::C), None); - misspelled.sort_by(|loc1, loc2| loc1.word.cmp(&loc2.word)); - assert_eq!(misspelled, expected); -} +use crate::{ + assert_helpers::get_marked_misspellings, + assert_word_locations_match, + utils::{get_processor, get_sorted_misspellings}, +}; #[test] -fn test_c_typedef() { - super::utils::init_logging(); - let sample_text = r#"typedef int Mispelll;"#; - - // Needs to be lexicographically sorted - let expected = [WordLocation::new( - "Mispelll".to_string(), - vec![TextRange { - start_byte: 12, - end_byte: 20, - }], - )]; - let processor = super::utils::get_processor(); - let mut misspelled = processor.spell_check(sample_text, Some(LanguageType::C), None); - misspelled.sort_by(|loc1, loc2| loc1.word.cmp(&loc2.word)); - assert_eq!(misspelled, expected); -} +fn test_c_example_file() { + let expected_result = + 
get_marked_misspellings(include_str!("../examples/example.c.in"), "@@", "@@"); -#[test] -fn test_c_enum() { super::utils::init_logging(); - let sample_text = r#"enum Colrs { Grean };"#; - - // Needs to be lexicographically sorted - let expected = [ - WordLocation::new( - "Colrs".to_string(), - vec![TextRange { - start_byte: 5, - end_byte: 10, - }], - ), - WordLocation::new( - "Grean".to_string(), - vec![TextRange { - start_byte: 13, - end_byte: 18, - }], - ), - ]; - let processor = super::utils::get_processor(); - let mut misspelled = processor.spell_check(sample_text, Some(LanguageType::C), None); - misspelled.sort_by(|loc1, loc2| loc1.word.cmp(&loc2.word)); - assert_eq!(misspelled, expected); + let misspellings = + get_sorted_misspellings(&expected_result.content, get_processor(), LanguageType::C); + assert_word_locations_match!(misspellings, expected_result.misspellings); } diff --git a/crates/codebook/tests/languages/utils.rs b/crates/codebook/tests/languages/utils.rs index 00586621..57f28546 100644 --- a/crates/codebook/tests/languages/utils.rs +++ b/crates/codebook/tests/languages/utils.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use codebook::Codebook; +use codebook::{Codebook, parser::WordLocation, queries::LanguageType}; use codebook_config::{CodebookConfig, CodebookConfigMemory}; pub fn get_processor() -> Codebook { @@ -41,3 +41,17 @@ pub fn get_processor_with_tags(include_tags: Vec<&str>, exclude_tags: Vec<&str>) pub fn init_logging() { let _ = env_logger::builder().is_test(true).try_init(); } + +/// Spell-checks `content` as `language` with `processor` and returns the +/// reported words sorted lexicographically by word. +pub fn get_sorted_misspellings( + content: &str, + processor: Codebook, + language: LanguageType, +) -> Vec<WordLocation> { + let mut misspellings = processor + .spell_check(content, Some(language), None) + .to_vec(); + misspellings.sort_by(|w1, w2| w1.word.cmp(&w2.word)); + misspellings +}