-
Notifications
You must be signed in to change notification settings - Fork 2
Cleanup / optimization as well as parallelizing the work #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0eeddc8
2f1dd62
17879d6
d962901
fc3fd0d
b38bd79
978d3b5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,19 +1,25 @@ | ||
| #![cfg_attr(feature = "fail-on-warnings", deny(warnings))] | ||
| #![forbid(unsafe_code)] | ||
| extern crate git2; | ||
| #[macro_use] | ||
| extern crate structopt; | ||
| #[macro_use] | ||
| extern crate lazy_static; | ||
|
|
||
| use git2::{Commit, Repository}; | ||
| use std::collections::HashMap; | ||
| use std::env; | ||
| extern crate crossbeam_channel; | ||
| extern crate git2; | ||
| extern crate rayon; | ||
|
|
||
| use git2::Repository; | ||
| use std::collections::{HashMap, HashSet}; | ||
| use std::error::Error; | ||
| use std::fmt; | ||
| use std::path::PathBuf; | ||
| use std::{char, env, fmt}; | ||
| use structopt::clap::AppSettings; | ||
| use structopt::StructOpt; | ||
|
|
||
| static CURSES: &str = include_str!("words.txt"); | ||
| lazy_static! { | ||
| static ref CURSES_SET: HashSet<&'static str> = { CURSES.lines().collect() }; | ||
| } | ||
|
|
||
| #[derive(StructOpt, Debug)] | ||
| #[structopt( | ||
|
|
@@ -35,6 +41,7 @@ struct Repo { | |
| name: String, | ||
| total_commits: usize, | ||
| total_curses: usize, | ||
| authors: HashMap<String, Author>, | ||
| } | ||
|
|
||
| impl fmt::Display for Repo { | ||
|
|
@@ -48,14 +55,23 @@ impl fmt::Display for Repo { | |
| } | ||
|
|
||
| impl Repo { | ||
| fn new(name: &str) -> Self { | ||
| let name = name.to_string(); | ||
| fn new(name: impl Into<String>) -> Self { | ||
| Repo { | ||
| name, | ||
| name: name.into(), | ||
| total_commits: 0, | ||
| total_curses: 0, | ||
| authors: HashMap::new(), | ||
| } | ||
| } | ||
|
|
||
| fn author_for(&mut self, author_name: &str) -> &mut Author { | ||
| if !self.authors.contains_key(author_name) { | ||
| self.authors | ||
| .insert(author_name.to_owned(), Author::new(author_name)); | ||
| } | ||
|
|
||
| self.authors.get_mut(author_name).expect("exists") | ||
| } | ||
| } | ||
|
|
||
| #[derive(Debug, Eq, PartialEq)] | ||
|
|
@@ -77,23 +93,22 @@ impl fmt::Display for Author { | |
| } | ||
|
|
||
| impl Author { | ||
| fn new(name: &str) -> Self { | ||
| let name = name.to_string(); | ||
| let curses: HashMap<String, usize> = HashMap::new(); | ||
| fn new(name: impl Into<String>) -> Self { | ||
| Author { | ||
| name, | ||
| curses, | ||
| name: name.into(), | ||
| curses: HashMap::new(), | ||
| total_commits: 0, | ||
| total_curses: 0, | ||
| } | ||
| } | ||
|
|
||
| fn update_occurrence(&mut self, curse: &str) { | ||
| if !self.curses.contains_key(curse) { | ||
| self.curses.insert(curse.to_string(), 1); | ||
| } else { | ||
| self.curses.entry(curse.to_string()).and_modify(|i| *i += 1); | ||
| } | ||
| fn update_occurrences(&mut self, curse: &str, occurrences: usize) { | ||
| self.curses | ||
| .get_mut(curse) | ||
| .map(|count| *count += occurrences) | ||
| .unwrap_or_else(|| { | ||
| self.curses.insert(curse.to_owned(), occurrences); | ||
| }) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can be replaced with: This does require a cast of the
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See above comment ^, this is a workaround so that we don't create a copy of the string unless necessary. |
||
| } | ||
|
|
||
| fn is_naughty(&self) -> bool { | ||
|
|
@@ -102,82 +117,92 @@ impl Author { | |
| } | ||
|
|
||
| fn main() -> Result<(), Box<Error>> { | ||
| let opt = Cli::from_args(); | ||
| let curses: Vec<&str> = CURSES.lines().collect(); | ||
| let path = if opt.directory.is_none() { | ||
| env::current_dir()? | ||
| } else { | ||
| opt.directory.unwrap() | ||
| let repo_path = { | ||
| let opt = Cli::from_args(); | ||
| if let Some(directory) = opt.directory { | ||
| directory | ||
| } else { | ||
| env::current_dir()? | ||
| } | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can also be replaced with the below: This is more concise, but the downside is that it makes it less obvious what the return of
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, it's preference here. I would say that it feels weird to use |
||
| }; | ||
| let repo = Repository::open(&path)?; | ||
| let mut revwalk = repo.revwalk()?; | ||
| let mut commits: Vec<Commit> = Vec::new(); | ||
| revwalk.push_head()?; | ||
| for commit in revwalk { | ||
| let commit = repo.find_commit(commit?)?; | ||
| commits.push(commit); | ||
| } | ||
| let mut repo = Repo::new(path.file_name().unwrap().to_str().unwrap()); | ||
| let mut authors: Vec<Author> = find_authors(&commits); | ||
|
|
||
| let repo = Repository::open(&repo_path)?; | ||
| let commits = { | ||
| let mut revwalk = repo.revwalk()?; | ||
| let mut commits = Vec::new(); | ||
| revwalk.push_head()?; | ||
| for commit_id in revwalk { | ||
| let commit = repo.find_commit(commit_id?)?; | ||
| commits.push(commit); | ||
| } | ||
| commits | ||
| }; | ||
|
|
||
| let (s, r) = crossbeam_channel::unbounded(); | ||
|
|
||
| let mut repo = Repo::new(repo_path.file_name().unwrap().to_str().unwrap()); | ||
| for commit in &commits { | ||
| let text = commit.message().unwrap().to_lowercase().to_string(); | ||
| let author = commit.author().name().unwrap().to_string(); | ||
| let index = authors | ||
| .iter() | ||
| .position(|i| i.name == author) | ||
| .expect("Could not find author"); | ||
| let mut author = &mut authors[index]; | ||
| author.total_commits += 1; | ||
| repo.total_commits += 1; | ||
| for word in text.split_whitespace() { | ||
| let word = clean_word(word); | ||
| if naughty_word(word.as_str(), &curses) { | ||
| author.total_curses += 1; | ||
| repo.total_curses += 1; | ||
| author.update_occurrence(word.as_str()); | ||
| if let (Some(author_name), Some(commit_message)) = ( | ||
| commit.author().name().map(|n| n.to_owned()), | ||
| commit.message().map(|msg| msg.to_lowercase()), | ||
| ) { | ||
| let job_sender = s.clone(); | ||
| rayon::spawn(move || { | ||
| let mut naughty_words = HashMap::new(); | ||
| for word in split_into_clean_words(&commit_message).filter(|w| naughty_word(w)) { | ||
| naughty_words | ||
| .get_mut(word) | ||
| .map(|count| *count += 1) | ||
| .unwrap_or_else(|| { | ||
| naughty_words.insert(word.to_owned(), 1); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The change from L111 could also be applied here
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See previous comment ^, avoiding string copies unless necessary. I would like to find a crate that extends the |
||
| }) | ||
| } | ||
| job_sender.send((author_name, naughty_words)); | ||
| }) | ||
| } else { | ||
| println!( | ||
| "Warning: skipping commit {:?} because author name OR commit message missing", | ||
| commit | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| // Drop the sender here so that when the work is done the channel closes | ||
| drop(s); | ||
|
|
||
| while let Some((author_name, naughty_words)) = r.recv() { | ||
| let mut total_curses_added = 0; | ||
|
|
||
| { | ||
| let author = repo.author_for(&author_name); | ||
| author.total_commits += 1; | ||
| for (word, occurrences) in naughty_words { | ||
| author.total_curses += occurrences; | ||
| total_curses_added += occurrences; | ||
| author.update_occurrences(&word, occurrences); | ||
| } | ||
| } | ||
|
|
||
| repo.total_commits += 1; | ||
| repo.total_curses += total_curses_added; | ||
| } | ||
|
|
||
| println!("{}", repo); | ||
| for mut author in authors { | ||
| for mut author in repo.authors.values() { | ||
| if author.is_naughty() { | ||
| println!("{}", author); | ||
| } | ||
| } | ||
| Ok(()) | ||
| } | ||
|
|
||
| fn clean_word(word: &str) -> String { | ||
| let mut res = String::with_capacity(word.len()); | ||
| for b in word.chars() { | ||
| match b { | ||
| '!' => {} | ||
| '?' => {} | ||
| ':' => {} | ||
| ';' => {} | ||
| '.' => {} | ||
| ',' => {} | ||
| _ => res.push(b), | ||
| } | ||
| } | ||
| res | ||
| fn split_into_clean_words(l: &str) -> impl Iterator<Item = &str> { | ||
| l.split(|c| !char::is_alphabetic(c)) | ||
| .filter(|w| !w.is_empty()) | ||
| } | ||
|
|
||
| fn find_authors(commits: &[Commit]) -> Vec<Author> { | ||
| let mut names: Vec<String> = Vec::new(); | ||
| let mut res: Vec<Author> = Vec::new(); | ||
| for commit in commits { | ||
| let name = commit.author().name().unwrap().to_string(); | ||
| if !names.contains(&name) { | ||
| res.push(Author::new(name.as_str())); | ||
| } | ||
| names.push(name); | ||
| } | ||
| res | ||
| } | ||
|
|
||
| fn naughty_word(word: &str, naughty_list: &[&str]) -> bool { | ||
| naughty_list.contains(&word) | ||
| fn naughty_word(word: &str) -> bool { | ||
| CURSES_SET.contains(&word) | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
|
|
@@ -186,19 +211,27 @@ mod test { | |
|
|
||
| #[test] | ||
| fn test_naughty_words() { | ||
| let curses: Vec<&str> = CURSES.lines().collect(); | ||
| assert!(naughty_word("fuck", &curses)); | ||
| assert!(naughty_word("cyberfuckers", &curses)); | ||
| assert!(!naughty_word("pretty", &curses)); | ||
| assert!(naughty_word("fuck")); | ||
| assert!(naughty_word("cyberfuckers")); | ||
| assert!(!naughty_word("pretty")); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_clean_word() { | ||
| let w1 = "This! is a string: with, some. words in? it;".to_string(); | ||
| let w1 = clean_word(w1.as_str()); | ||
| let clean_words = split_into_clean_words("This! is a string: with, some. words in? it;") | ||
| .collect::<Vec<_>>(); | ||
| assert_eq!( | ||
| "This is a string with some words in it", | ||
| w1.trim().to_string() | ||
| clean_words, | ||
| vec!["This", "is", "a", "string", "with", "some", "words", "in", "it"] | ||
| ); | ||
| } | ||
|
|
||
| // The new implementation yields str slices and does not allocate, which | ||
| // is good. However, it differs from the previous implementation because | ||
| // words with invalid characters in the middle are not joined together. | ||
| #[test] | ||
| fn does_not_join_words_with_invalid_characters_in_middle() { | ||
| let clean_words = split_into_clean_words("inv!alid").collect::<Vec<_>>(); | ||
| assert_eq!(clean_words, vec!["inv", "alid"]); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can be shortened to the below:
self.authors.entry(author_name.into()).or_insert(Author::new(author_name))This makes it a more idiomatic method for a "get or create" pattern.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I considered the approach, but I really wanted to not allocated a new
Stringunless it was strictly necessary (it's only needed when inserting into the HashMap). This is a long standing problem, I would love to know if there's an easier way to do this with stdlib or if there's a crate that solves this (see: https://users.rust-lang.org/t/efficient-string-hashmaps-for-a-frequency-count/7752/8).Additionally, you would probably want to use
or_insert_withinstead ofor_insertas theAuthor::new(..)is evaluated eagerly for each call withor_insert(andAuthor::newcopies a string).Something like:
self.authors.entry(author_name.into()).or_insert_with(|| Author::new(author_name)).