Ok(())
}
/// Splits `contents` into sentences (terminated by `.`, `!` or `?`) and
/// each sentence into cleaned word tokens.
///
/// Cleaning strips all ASCII punctuation from every token; any run of
/// whitespace (spaces, tabs, newlines) separates tokens. Sentence
/// fragments of length <= 1 byte (e.g. the empty tail after a trailing
/// period) are dropped, as are tokens that are empty after punctuation
/// removal.
///
/// Returns one `Vec<String>` of tokens per retained sentence; an empty
/// input yields an empty outer `Vec`.
pub fn tokenize(contents: &str) -> Vec<Vec<String>> {
    let mut tokens = Vec::new();
    // The length check discards the empty fragment produced after a
    // trailing terminator as well as 1-byte noise. NOTE: len() counts
    // bytes, so a lone non-ASCII char (2+ bytes) still passes — kept to
    // preserve the existing threshold semantics.
    for sentence in contents.split(&['.', '!', '?']).filter(|s| s.len() > 1) {
        // split_whitespace() treats newlines and tabs as separators, so
        // no separate '\n' scrubbing is needed (deleting '\n' characters
        // from tokens, as before, fused words across line breaks).
        let cleaned: Vec<String> = sentence
            .split_whitespace()
            .map(|word| {
                word.chars()
                    .filter(|c| !c.is_ascii_punctuation())
                    .collect::<String>()
            })
            // A token that was pure punctuation (e.g. "-") is now empty;
            // drop it instead of keeping a "" entry.
            .filter(|w| !w.is_empty())
            .collect();
        tokens.push(cleaned);
    }
    tokens
}
use super::*;
#[test]
fn single_sentence() {
    // A single sentence without a terminator comes back as exactly one
    // token list, with each whitespace-separated word as its own String.
    let contents = "dette er en banal test";
    let left_side = vec![vec!["dette", "er", "en", "banal", "test"]];
    let right_side = tokenize(contents);
    assert_eq!(left_side, right_side);
}
#[test]
- fn multi_line_whitespace_simple() {
+ fn multiple_sentences() {
let contents = "\
- dette er en banal test \n som inneholder to linjer";
+ Dette er den første setningen. Dette er den andre setningen.";
let mut left_side = Vec::new();
- let c1 = vec!["dette", "er", "en", "banal", "test"];
- let c2 = vec!["som", "inneholder", "to", "linjer"];
+ let c1 = vec!["Dette", "er", "den", "første", "setningen"];
+ let c2 = vec!["Dette", "er", "den", "andre", "setningen"];
left_side.push(c1);
left_side.push(c2);
let right_side = tokenize(&contents);