1 | | - | use std::{fs, path::PathBuf}; |
2 | | - | use indicatif::{HumanDuration, MultiProgress, ProgressBar, ProgressStyle, ParallelProgressIterator}; |
3 | 1 | | use anyhow::Result; |
4 | | - | use fxhash::hash32 as hasher; |
| 2 | + | use dashmap::DashMap; |
| 3 | + | use fxhash::hash64 as hasher; |
5 | 4 | | use glob::glob; |
6 | | - | use itertools::Itertools; |
| 5 | + | use indicatif::{ParallelProgressIterator, ProgressStyle}; |
| 6 | + | use memmap2::Mmap; |
7 | 7 | | use rayon::prelude::*; |
| 8 | + | use std::hash::Hasher; |
| 9 | + | use std::{fs, path::PathBuf}; |
8 | 10 | | |
9 | | - | use crate::{ |
10 | | - | database::{self, File}, |
11 | | - | params::Params, |
12 | | - | }; |
| 11 | + | use crate::{file_manager::File, params::Params}; |
13 | 12 | | |
14 | | - | pub fn duplicates(app_opts: &Params, connection: &sqlite::Connection) -> Result<Vec<File>> { |
15 | | - | let scan_results = scan(app_opts, connection)?; |
16 | | - | let base_path = app_opts.get_directory()?; |
17 | | - | |
18 | | - | index_files(scan_results, connection)?; |
19 | | - | database::duplicate_hashes(connection, &base_path) |
/// Which property of a file an indexing pass buckets on.
///
/// `Size` is the cheap first pass (one metadata read per file); `Hash` is
/// the expensive content-hashing pass, run only on size-collision candidates.
// NOTE(review): identifier is missing an 'i' ("IndexCriteria") — renaming
// would touch every call site in this file, so it is left as-is here.
#[derive(Clone, Copy)]
enum IndexCritera {
    Size,
    Hash,
}
21 | 18 | | |
22 | | - | fn get_glob_patterns(opts: &Params, directory: &str) -> Vec<PathBuf> { |
23 | | - | opts.types |
24 | | - | .clone() |
25 | | - | .unwrap_or_else(|| String::from("*")) |
26 | | - | .split(',') |
27 | | - | .map(|filetype| format!("*.{}", filetype)) |
28 | | - | .map(|filetype| { |
29 | | - | vec![directory.to_owned(), String::from("**"), filetype] |
30 | | - | .iter() |
31 | | - | .collect() |
32 | | - | }) |
33 | | - | .collect() |
34 | | - | } |
| 19 | + | pub fn duplicates(app_opts: &Params) -> Result<DashMap<String, Vec<File>>> { |
| 20 | + | let scan_results = scan(app_opts)?; |
| 21 | + | let size_index_store = index_files(scan_results, IndexCritera::Size)?; |
35 | 22 | | |
36 | | - | fn is_indexed_file(path: impl Into<String>, indexed: &[File]) -> bool { |
37 | | - | indexed |
38 | | - | .iter() |
39 | | - | .map(|file| file.path.clone()) |
40 | | - | .contains(&path.into()) |
| 23 | + | let sizewize_duplicate_files = size_index_store |
| 24 | + | .into_par_iter() |
| 25 | + | .filter(|(_, files)| files.len() > 1) |
| 26 | + | .map(|(_, files)| files) |
| 27 | + | .flatten() |
| 28 | + | .collect::<Vec<File>>(); |
| 29 | + | |
| 30 | + | if sizewize_duplicate_files.len() > 1 { |
| 31 | + | let size_wise_duplicate_paths = sizewize_duplicate_files |
| 32 | + | .into_par_iter() |
| 33 | + | .map(|file| file.path) |
| 34 | + | .collect::<Vec<String>>(); |
| 35 | + | |
| 36 | + | let hash_index_store = index_files(size_wise_duplicate_paths, IndexCritera::Hash)?; |
| 37 | + | let duplicate_files = hash_index_store |
| 38 | + | .into_par_iter() |
| 39 | + | .filter(|(_, files)| files.len() > 1) |
| 40 | + | .collect(); |
| 41 | + | |
| 42 | + | Ok(duplicate_files) |
| 43 | + | } else { |
| 44 | + | Ok(DashMap::new()) |
| 45 | + | } |
41 | 46 | | } |
42 | 47 | | |
43 | | - | fn scan(app_opts: &Params, connection: &sqlite::Connection) -> Result<Vec<String>> { |
44 | | - | let directory = app_opts.get_directory()?; |
45 | | - | let glob_patterns: Vec<PathBuf> = get_glob_patterns(app_opts, &directory); |
46 | | - | let indexed_paths = database::indexed_paths(connection)?; |
| 48 | + | fn scan(app_opts: &Params) -> Result<Vec<String>> { |
| 49 | + | let glob_patterns: Vec<PathBuf> = app_opts.get_glob_patterns(); |
47 | 50 | | let files: Vec<String> = glob_patterns |
48 | 51 | | .par_iter() |
49 | | - | .progress_with_style(ProgressStyle::with_template("{spinner:.green} [scanning files] [{wide_bar:.cyan/blue}] {pos}/{len} files").unwrap()) |
| 52 | + | .progress_with_style(ProgressStyle::with_template( |
| 53 | + | "{spinner:.green} [scanning files] [{wide_bar:.cyan/blue}] {pos}/{len} files", |
| 54 | + | )?) |
50 | 55 | | .filter_map(|glob_pattern| glob(glob_pattern.as_os_str().to_str()?).ok()) |
51 | 56 | | .flat_map(|file_vec| { |
52 | 57 | | file_vec |
53 | 58 | | .filter_map(|x| Some(x.ok()?.as_os_str().to_str()?.to_string())) |
54 | | - | .filter(|fpath| !is_indexed_file(fpath, &indexed_paths)) |
55 | 59 | | .filter(|glob_result| { |
56 | 60 | | fs::metadata(glob_result) |
57 | 61 | | .map(|f| f.is_file()) |
| skipped 6 lines |
64 | 68 | | Ok(files) |
65 | 69 | | } |
66 | 70 | | |
67 | | - | fn index_files(files: Vec<String>, connection: &sqlite::Connection) -> Result<()> { |
68 | | - | let hashed: Vec<File> = files |
| 71 | + | fn process_file_size_index(fpath: String) -> Result<File> { |
| 72 | + | Ok(File { |
| 73 | + | path: fpath.clone(), |
| 74 | + | size: Some(fs::metadata(fpath)?.len()), |
| 75 | + | hash: None, |
| 76 | + | }) |
| 77 | + | } |
| 78 | + | |
| 79 | + | fn process_file_hash_index(fpath: String) -> Result<File> { |
| 80 | + | Ok(File { |
| 81 | + | path: fpath.clone(), |
| 82 | + | size: None, |
| 83 | + | hash: Some(hash_file(&fpath).unwrap_or_default()), |
| 84 | + | }) |
| 85 | + | } |
| 86 | + | |
| 87 | + | fn process_file_index( |
| 88 | + | fpath: String, |
| 89 | + | store: &DashMap<String, Vec<File>>, |
| 90 | + | index_criteria: IndexCritera, |
| 91 | + | ) { |
| 92 | + | match index_criteria { |
| 93 | + | IndexCritera::Size => { |
| 94 | + | let processed_file = process_file_size_index(fpath).unwrap(); |
| 95 | + | store |
| 96 | + | .entry(processed_file.size.unwrap_or_default().to_string()) |
| 97 | + | .and_modify(|fileset| fileset.push(processed_file.clone())) |
| 98 | + | .or_insert_with(|| vec![processed_file]); |
| 99 | + | } |
| 100 | + | IndexCritera::Hash => { |
| 101 | + | let processed_file = process_file_hash_index(fpath).unwrap(); |
| 102 | + | let indexhash = processed_file.clone().hash.unwrap_or_default(); |
| 103 | + | |
| 104 | + | store |
| 105 | + | .entry(indexhash) |
| 106 | + | .and_modify(|fileset| fileset.push(processed_file.clone())) |
| 107 | + | .or_insert_with(|| vec![processed_file]); |
| 108 | + | } |
| 109 | + | } |
| 110 | + | } |
| 111 | + | |
| 112 | + | fn index_files( |
| 113 | + | files: Vec<String>, |
| 114 | + | index_criteria: IndexCritera, |
| 115 | + | ) -> Result<DashMap<String, Vec<File>>> { |
| 116 | + | let store: DashMap<String, Vec<File>> = DashMap::new(); |
| 117 | + | files |
69 | 118 | | .into_par_iter() |
70 | | - | .progress_with_style(ProgressStyle::with_template("{spinner:.green} [indexing files] [{wide_bar:.cyan/blue}] {pos}/{len} files").unwrap()) |
71 | | - | .filter_map(|file| { |
72 | | - | let hash = hash_file(&file).ok()?; |
73 | | - | Some(database::File { path: file, hash }) |
74 | | - | }) |
75 | | - | .collect(); |
| 119 | + | .progress_with_style(ProgressStyle::with_template( |
| 120 | + | "{spinner:.green} [indexing files] [{wide_bar:.cyan/blue}] {pos}/{len} files", |
| 121 | + | )?) |
| 122 | + | .for_each(|file| process_file_index(file, &store, index_criteria)); |
76 | 123 | | |
77 | | - | hashed |
78 | | - | .iter() |
79 | | - | .try_for_each(|file| database::put(file, connection)) |
| 124 | + | Ok(store) |
80 | 125 | | } |
81 | 126 | | |
82 | | - | pub fn hash_file(filepath: &str) -> Result<String> { |
| 127 | + | pub fn incremental_hashing(filepath: &str) -> Result<String> { |
| 128 | + | let file = fs::File::open(filepath)?; |
| 129 | + | let fmap = unsafe { Mmap::map(&file)? }; |
| 130 | + | let mut inchasher = fxhash::FxHasher::default(); |
| 131 | + | |
| 132 | + | fmap.chunks(1_000_000) |
| 133 | + | .for_each(|mega| inchasher.write(mega)); |
| 134 | + | |
| 135 | + | Ok(format!("{}", inchasher.finish())) |
| 136 | + | } |
| 137 | + | |
| 138 | + | pub fn standard_hashing(filepath: &str) -> Result<String> { |
83 | 139 | | let file = fs::read(filepath)?; |
84 | | - | let hash = hasher(&*file).to_string(); |
| 140 | + | Ok(hasher(&*file).to_string()) |
| 141 | + | } |
| 142 | + | |
| 143 | + | pub fn hash_file(filepath: &str) -> Result<String> { |
| 144 | + | let filemeta = fs::metadata(filepath)?; |
85 | 145 | | |
86 | | - | Ok(hash) |
| 146 | + | // NOTE: USE INCREMENTAL HASHING ONLY FOR FILES > 100MB |
| 147 | + | match filemeta.len() < 100_000_000 { |
| 148 | + | true => standard_hashing(filepath), |
| 149 | + | false => incremental_hashing(filepath), |
| 150 | + | } |
87 | 151 | | } |
88 | 152 | | |