Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ fail-on-warnings = []
[dependencies]
git2 = "0.7"
structopt = "0.2"
lazy_static = "1.1.0"
rayon = "1.0.2"
crossbeam-channel = "0.2.6"
211 changes: 122 additions & 89 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,25 @@
#![cfg_attr(feature = "fail-on-warnings", deny(warnings))]
#![forbid(unsafe_code)]
extern crate git2;
#[macro_use]
extern crate structopt;
#[macro_use]
extern crate lazy_static;

use git2::{Commit, Repository};
use std::collections::HashMap;
use std::env;
extern crate crossbeam_channel;
extern crate git2;
extern crate rayon;

use git2::Repository;
use std::collections::{HashMap, HashSet};
use std::error::Error;
use std::fmt;
use std::path::PathBuf;
use std::{char, env, fmt};
use structopt::clap::AppSettings;
use structopt::StructOpt;

static CURSES: &str = include_str!("words.txt");
lazy_static! {
static ref CURSES_SET: HashSet<&'static str> = { CURSES.lines().collect() };
}

#[derive(StructOpt, Debug)]
#[structopt(
Expand All @@ -35,6 +41,7 @@ struct Repo {
name: String,
total_commits: usize,
total_curses: usize,
authors: HashMap<String, Author>,
}

impl fmt::Display for Repo {
Expand All @@ -48,14 +55,23 @@ impl fmt::Display for Repo {
}

impl Repo {
fn new(name: &str) -> Self {
let name = name.to_string();
fn new(name: impl Into<String>) -> Self {
Repo {
name,
name: name.into(),
total_commits: 0,
total_curses: 0,
authors: HashMap::new(),
}
}

fn author_for(&mut self, author_name: &str) -> &mut Author {
if !self.authors.contains_key(author_name) {
self.authors
.insert(author_name.to_owned(), Author::new(author_name));
}

self.authors.get_mut(author_name).expect("exists")
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be shortened to the below:
self.authors.entry(author_name.into()).or_insert(Author::new(author_name))

This makes it a more idiomatic method for a "get or create" pattern.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered the approach, but I really wanted to not allocated a new String unless it was strictly necessary (it's only needed when inserting into the HashMap). This is a long standing problem, I would love to know if there's an easier way to do this with stdlib or if there's a crate that solves this (see: https://users.rust-lang.org/t/efficient-string-hashmaps-for-a-frequency-count/7752/8).

Additionally, you would probably want to use or_insert_with instead of or_insert as the Author::new(..) is evaluated eagerly for each call with or_insert (and Author::new copies a string).

Something like: self.authors.entry(author_name.into()).or_insert_with(|| Author::new(author_name)).

}

#[derive(Debug, Eq, PartialEq)]
Expand All @@ -77,23 +93,22 @@ impl fmt::Display for Author {
}

impl Author {
fn new(name: &str) -> Self {
let name = name.to_string();
let curses: HashMap<String, usize> = HashMap::new();
fn new(name: impl Into<String>) -> Self {
Author {
name,
curses,
name: name.into(),
curses: HashMap::new(),
total_commits: 0,
total_curses: 0,
}
}

fn update_occurrence(&mut self, curse: &str) {
if !self.curses.contains_key(curse) {
self.curses.insert(curse.to_string(), 1);
} else {
self.curses.entry(curse.to_string()).and_modify(|i| *i += 1);
}
fn update_occurrences(&mut self, curse: &str, occurrences: usize) {
self.curses
.get_mut(curse)
.map(|count| *count += occurrences)
.unwrap_or_else(|| {
self.curses.insert(curse.to_owned(), occurrences);
})
Copy link

@pantsman0 pantsman0 Oct 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be replaced with:
*self.curses.entry(curse.into()).or_insert(0) += occurrences

This does require a cast of the curse argument, since you need to pass a K instead of Q: Borrow<K>, but I think it makes it clearer what is going on.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See above comment ^, this is a workaround so that we don't create a copy of the string unless necessary.

}

fn is_naughty(&self) -> bool {
Expand All @@ -102,82 +117,92 @@ impl Author {
}

fn main() -> Result<(), Box<Error>> {
let opt = Cli::from_args();
let curses: Vec<&str> = CURSES.lines().collect();
let path = if opt.directory.is_none() {
env::current_dir()?
} else {
opt.directory.unwrap()
let repo_path = {
let opt = Cli::from_args();
if let Some(directory) = opt.directory {
directory
} else {
env::current_dir()?
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can also be replaced with the below:

let repo_path = match Cli::from_args().directory {
        Some(d) => d,
        None => env::current_dir()?
};

This is more concise, but the downside is that it makes it less obvious what the return of from_args() is and why it would have a directory member.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's preference here. I would say that it feels weird to use Cli::from_args().directory as usually there are other parameters / flags, but there aren't any currently so it would be fine as well :P.

};
let repo = Repository::open(&path)?;
let mut revwalk = repo.revwalk()?;
let mut commits: Vec<Commit> = Vec::new();
revwalk.push_head()?;
for commit in revwalk {
let commit = repo.find_commit(commit?)?;
commits.push(commit);
}
let mut repo = Repo::new(path.file_name().unwrap().to_str().unwrap());
let mut authors: Vec<Author> = find_authors(&commits);

let repo = Repository::open(&repo_path)?;
let commits = {
let mut revwalk = repo.revwalk()?;
let mut commits = Vec::new();
revwalk.push_head()?;
for commit_id in revwalk {
let commit = repo.find_commit(commit_id?)?;
commits.push(commit);
}
commits
};

let (s, r) = crossbeam_channel::unbounded();

let mut repo = Repo::new(repo_path.file_name().unwrap().to_str().unwrap());
for commit in &commits {
let text = commit.message().unwrap().to_lowercase().to_string();
let author = commit.author().name().unwrap().to_string();
let index = authors
.iter()
.position(|i| i.name == author)
.expect("Could not find author");
let mut author = &mut authors[index];
author.total_commits += 1;
repo.total_commits += 1;
for word in text.split_whitespace() {
let word = clean_word(word);
if naughty_word(word.as_str(), &curses) {
author.total_curses += 1;
repo.total_curses += 1;
author.update_occurrence(word.as_str());
if let (Some(author_name), Some(commit_message)) = (
commit.author().name().map(|n| n.to_owned()),
commit.message().map(|msg| msg.to_lowercase()),
) {
let job_sender = s.clone();
rayon::spawn(move || {
let mut naughty_words = HashMap::new();
for word in split_into_clean_words(&commit_message).filter(|w| naughty_word(w)) {
naughty_words
.get_mut(word)
.map(|count| *count += 1)
.unwrap_or_else(|| {
naughty_words.insert(word.to_owned(), 1);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The change from L111 could also be applied here

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See previous comment ^, avoiding string copies unless necessary.

I would like to find a crate that extends the HashMap<String, T> API to clean up the duplication though..

})
}
job_sender.send((author_name, naughty_words));
})
} else {
println!(
"Warning: skipping commit {:?} because author name OR commit message missing",
commit
);
}
}

// Drop the sender here so that when the work is done the channel closes
drop(s);

while let Some((author_name, naughty_words)) = r.recv() {
let mut total_curses_added = 0;

{
let author = repo.author_for(&author_name);
author.total_commits += 1;
for (word, occurrences) in naughty_words {
author.total_curses += occurrences;
total_curses_added += occurrences;
author.update_occurrences(&word, occurrences);
}
}

repo.total_commits += 1;
repo.total_curses += total_curses_added;
}

println!("{}", repo);
for mut author in authors {
for mut author in repo.authors.values() {
if author.is_naughty() {
println!("{}", author);
}
}
Ok(())
}

fn clean_word(word: &str) -> String {
let mut res = String::with_capacity(word.len());
for b in word.chars() {
match b {
'!' => {}
'?' => {}
':' => {}
';' => {}
'.' => {}
',' => {}
_ => res.push(b),
}
}
res
fn split_into_clean_words(l: &str) -> impl Iterator<Item = &str> {
l.split(|c| !char::is_alphabetic(c))
.filter(|w| !w.is_empty())
}

fn find_authors(commits: &[Commit]) -> Vec<Author> {
let mut names: Vec<String> = Vec::new();
let mut res: Vec<Author> = Vec::new();
for commit in commits {
let name = commit.author().name().unwrap().to_string();
if !names.contains(&name) {
res.push(Author::new(name.as_str()));
}
names.push(name);
}
res
}

fn naughty_word(word: &str, naughty_list: &[&str]) -> bool {
naughty_list.contains(&word)
fn naughty_word(word: &str) -> bool {
CURSES_SET.contains(&word)
}

#[cfg(test)]
Expand All @@ -186,19 +211,27 @@ mod test {

#[test]
fn test_naughty_words() {
let curses: Vec<&str> = CURSES.lines().collect();
assert!(naughty_word("fuck", &curses));
assert!(naughty_word("cyberfuckers", &curses));
assert!(!naughty_word("pretty", &curses));
assert!(naughty_word("fuck"));
assert!(naughty_word("cyberfuckers"));
assert!(!naughty_word("pretty"));
}

#[test]
fn test_clean_word() {
let w1 = "This! is a string: with, some. words in? it;".to_string();
let w1 = clean_word(w1.as_str());
let clean_words = split_into_clean_words("This! is a string: with, some. words in? it;")
.collect::<Vec<_>>();
assert_eq!(
"This is a string with some words in it",
w1.trim().to_string()
clean_words,
vec!["This", "is", "a", "string", "with", "some", "words", "in", "it"]
);
}

// The new implementation yields str slices and does not allocate, which
// is good. However, it differs from the previous implementation because
// words with invalid characters in the middle are not joined together.
#[test]
fn does_not_join_words_with_invalid_characters_in_middle() {
let clean_words = split_into_clean_words("inv!alid").collect::<Vec<_>>();
assert_eq!(clean_words, vec!["inv", "alid"]);
}
}