From 390a801a5314720680227b3fdb86d66f88442a54 Mon Sep 17 00:00:00 2001 From: Sondre Wold Date: Wed, 1 May 2024 19:57:17 +0200 Subject: [PATCH] Direct approach --- mitok/src/lib.rs | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/mitok/src/lib.rs b/mitok/src/lib.rs index b9a9b4d..324d151 100644 --- a/mitok/src/lib.rs +++ b/mitok/src/lib.rs @@ -25,12 +25,23 @@ pub fn run(config: Config) -> Result<(), Box<dyn Error>> { Ok(()) } -pub fn tokenize<'a>(contents: &'a str) -> Vec<Vec<&'a str>> { +pub fn tokenize(contents: &str) -> Vec<Vec<String>> { let mut tokens = Vec::new(); - for line in contents.lines() { - let mut line_tokens: Vec<&str> = line.split(' ').collect(); - line_tokens.retain(|t| t.len() > 0); - tokens.push(line_tokens) + let sentences: Vec<_> = contents + .split(&['.', '!', '?']) + .filter(|c| !c.is_empty()) + .filter(|c| c.len() > 1) + .collect(); + for sentence in sentences { + let raw_tokens: Vec<&str> = sentence.split(" ").filter(|t| !t.is_empty()).collect(); + let mut cleaned_tokens: Vec<String> = Vec::new(); + for token in raw_tokens { + let mut r_t = String::from(token); + r_t.retain(|c| !c.is_ascii_punctuation()); + r_t = r_t.chars().filter(|&c| c != '\n').collect(); + cleaned_tokens.push(r_t.clone()); + } + tokens.push(cleaned_tokens); } tokens } @@ -40,25 +51,25 @@ mod tests { use super::*; #[test] - fn single_line_whitespace_simple() { + fn single_sentence() { let contents = "\ - dette er en banal test."; + dette er en banal test"; let mut left_side = Vec::new(); - let c = vec!["dette", "er", "en", "banal", "test."]; + let c = vec!["dette", "er", "en", "banal", "test"]; left_side.push(c); let right_side = tokenize(&contents); assert_eq!(left_side, right_side); } #[test] - fn multi_line_whitespace_simple() { + fn multiple_sentences() { let contents = "\ - dette er en banal test \n som inneholder to linjer"; + Dette er den første setningen. 
Dette er den andre setningen."; let mut left_side = Vec::new(); - let c1 = vec!["dette", "er", "en", "banal", "test"]; - let c2 = vec!["som", "inneholder", "to", "linjer"]; + let c1 = vec!["Dette", "er", "den", "første", "setningen"]; + let c2 = vec!["Dette", "er", "den", "andre", "setningen"]; left_side.push(c1); left_side.push(c2); let right_side = tokenize(&contents); -- 2.39.5