deduplicator · commit 5aea0eb6
• Cargo.lock
    skipped 70 lines
    71 71  checksum = "dfb24e866b15a1af2a1b663f10c6b6b8f397a84aadb828f12e5b289ec23a3a3c"
    72 72   
    73 73  [[package]]
     74 +name = "bytesize"
     75 +version = "1.1.0"
     76 +source = "registry+https://github.com/rust-lang/crates.io-index"
     77 +checksum = "6c58ec36aac5066d5ca17df51b3e70279f5670a72102f5752cb7e7c856adfc70"
     78 + 
     79 +[[package]]
    74 80  name = "cc"
    75 81  version = "1.0.78"
    76 82  source = "registry+https://github.com/rust-lang/crates.io-index"
    skipped 225 lines
    302 308  version = "0.1.1"
    303 309  dependencies = [
    304 310   "anyhow",
     311 + "bytesize",
    305 312   "chrono",
    306 313   "clap",
    307 314   "colored",
    308 315   "dashmap",
    309 316   "fxhash",
    310 317   "glob",
    311  - "humansize",
    312 318   "indicatif",
    313 319   "itertools",
    314 320   "memmap2",
    skipped 121 lines
    436 442  ]
    437 443   
    438 444  [[package]]
    439  -name = "humansize"
    440  -version = "2.1.2"
    441  -source = "registry+https://github.com/rust-lang/crates.io-index"
    442  -checksum = "4e682e2bd70ecbcce5209f11a992a4ba001fea8e60acf7860ce007629e6d2756"
    443  -dependencies = [
    444  - "libm",
    445  -]
    446  - 
    447  -[[package]]
    448 445  name = "iana-time-zone"
    449 446  version = "0.1.53"
    450 447  source = "registry+https://github.com/rust-lang/crates.io-index"
    skipped 88 lines
    539 536  version = "0.2.139"
    540 537  source = "registry+https://github.com/rust-lang/crates.io-index"
    541 538  checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
    542  - 
    543  -[[package]]
    544  -name = "libm"
    545  -version = "0.2.6"
    546  -source = "registry+https://github.com/rust-lang/crates.io-index"
    547  -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"
    548 539   
    549 540  [[package]]
    550 541  name = "link-cplusplus"
    skipped 600 lines
• Cargo.toml
    skipped 9 lines
    10 10   
    11 11  [dependencies]
    12 12  anyhow = "1.0.68"
     13 +bytesize = "1.1.0"
    13 14  chrono = "0.4.23"
    14 15  clap = { version = "4.0.32", features = ["derive"] }
    15 16  colored = "2.0.0"
    16 17  dashmap = { version = "5.4.0", features = ["rayon"] }
    17 18  fxhash = "0.2.1"
    18 19  glob = "0.3.0"
    19  -humansize = "2.1.2"
    20 20  indicatif = { version = "0.17.2", features = ["rayon", "tokio"] }
    21 21  itertools = "0.10.5"
    22 22  memmap2 = "0.5.8"
    skipped 6 lines
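    Net effect of the manifest change: `humansize` is swapped for `bytesize` (Cargo.lock above also drops humansize's transitive `libm` dependency), and the one new crate now covers both formatting sizes for display in `src/output.rs` and parsing the new `--minsize` argument in `src/params.rs`.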
• README.md
    skipped 13 lines
    14 14  Usage: deduplicator [OPTIONS]
    15 15   
    16 16  Options:
    17  -  -t, --types <TYPES>    Filetypes to deduplicate (default = all)
    18  -      --dir <DIR>        Run Deduplicator on dir different from pwd
    19  -  -i, --interactive      Delete files interactively
    20  -  -h, --help             Print help information
    21  -  -V, --version          Print version information
     17 +  -t, --types <TYPES>      Filetypes to deduplicate (default = all)
     18 +      --dir <DIR>          Run Deduplicator on dir different from pwd
     19 +  -i, --interactive        Delete files interactively
     20 +  -m, --minsize <MINSIZE>  Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T). [default = 0]
     21 +  -h, --help               Print help information
     22 +  -V, --version            Print version information
    22 23  ```
    23 24   
    24 25  <h2 align="center">Installation</h2>
    skipped 27 lines
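    With the new flag in place, an invocation like `deduplicator --minsize 1M` skips duplicates smaller than one megabyte; sizes use the `B/K/M/G/T` suffixes shown in the help text above.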
• src/filters.rs
     1 +use crate::file_manager::File;
     2 +use crate::params::Params;
     3 + 
     4 +pub fn is_file_gt_minsize(app_opts: &Params, file: &File) -> bool {
     5 +    match app_opts.get_minsize() {
     6 +        Some(msize) => match file.size {
     7 +            Some(fsize) => fsize >= msize,
     8 +            None => true,
     9 +        },
     10 +        None => true,
     11 +    }
     12 +}
     13 + 
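    Note the permissive fallbacks in the new filter: a missing or unparsable `--minsize` disables the check entirely, and a file whose size could not be read is kept rather than dropped. A minimal, self-contained sketch of those semantics (hypothetical `check` helper, not part of the crate):

    ```rust
    // Hypothetical stand-in for is_file_gt_minsize's core logic, with the
    // Params/File lookups replaced by plain Options.
    fn check(minsize: Option<u64>, file_size: Option<u64>) -> bool {
        match minsize {
            None => true, // no --minsize given: everything passes
            Some(msize) => match file_size {
                Some(fsize) => fsize >= msize, // keep files at or above the threshold
                None => true,                  // unknown size: keep rather than drop
            },
        }
    }

    fn main() {
        assert!(check(None, Some(10)));
        assert!(check(Some(1024), Some(2048)));
        assert!(!check(Some(1024), Some(512)));
        assert!(check(Some(1024), None)); // a metadata failure never hides a candidate
    }
    ```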
• src/main.rs
    skipped 2 lines
    3 3  mod output;
    4 4  mod params;
    5 5  mod scanner;
     6 +mod filters;
    6 7   
    7 8  use anyhow::Result;
    8 9  use app::App;
    skipped 7 lines
• src/output.rs
    skipped 4 lines
    5 5  use chrono::DateTime;
    6 6  use colored::Colorize;
    7 7  use dashmap::DashMap;
    8  -use humansize::{format_size, DECIMAL};
    9 8  use itertools::Itertools;
    10 9  use prettytable::{format, row, Table};
    11 10  use std::io::Write;
    skipped 18 lines
    30 29      Ok(format!("...{:<32}", display_range))
    31 30  }
    32 31   
    33  -fn file_size(path: &String) -> Result<String> {
    34  -    let mdata = fs::metadata(path)?;
    35  -    let formatted_size = format!("{:>12}", format_size(mdata.len(), DECIMAL));
    36  -    Ok(formatted_size)
     32 +fn file_size(file: &File) -> Result<String> {
     33 +    Ok(format!("{:>12}", bytesize::ByteSize::b(file.size.unwrap())))
    37 34  }
    38 35   
    39 36  fn modified_time(path: &String) -> Result<String> {
    skipped 79 lines
    119 116   
    120 117  pub fn interactive(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
    121 118      print_meta_info();
     119 + 
     120 +    if duplicates.is_empty() {
     121 +        println!(
     122 +            "\n{}",
     123 +            "No duplicates found matching your search criteria.".green()
     124 +        );
     125 +        return;
     126 +    }
     127 + 
    122 128      duplicates
    123 129          .clone()
    124 130          .into_iter()
     131 +        .sorted_unstable_by_key(|f| {
     132 +            -(f.1.first().and_then(|ff| ff.size).unwrap_or_default() as i64)
     133 +        }) // sort by descending file size in interactive mode
    125 134          .enumerate()
    126 135          .for_each(|(gindex, (_, group))| {
    127 136              let mut itable = Table::new();
    skipped 3 lines
    131 140              itable.add_row(row![
    132 141                  index,
    133 142                  format_path(&file.path, opts).unwrap_or_default().blue(),
    134  -                file_size(&file.path).unwrap_or_default().red(),
     143 +                file_size(&file).unwrap_or_default().red(),
    135 144                  modified_time(&file.path).unwrap_or_default().yellow()
    136 145              ]);
    137 146          });
    skipped 5 lines
    143 152  pub fn print(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
    144 153      print_meta_info();
    145 154   
     155 +    if duplicates.is_empty() {
     156 +        println!(
     157 +            "\n{}",
     158 +            "No duplicates found matching your search criteria.".green()
     159 +        );
     160 +        return;
     161 +    }
     162 + 
    146 163      let mut output_table = Table::new();
    147 164      output_table.set_titles(row!["hash", "duplicates"]);
    148  -    duplicates.into_iter().for_each(|(hash, group)| {
    149  -        let mut inner_table = Table::new();
    150  -        inner_table.set_format(*format::consts::FORMAT_NO_BORDER_LINE_SEPARATOR);
    151  -        group.iter().for_each(|file| {
    152  -            inner_table.add_row(row![
    153  -                format_path(&file.path, opts).unwrap_or_default().blue(),
    154  -                file_size(&file.path).unwrap_or_default().red(),
    155  -                modified_time(&file.path).unwrap_or_default().yellow()
    156  -            ]);
     165 +    duplicates
     166 +        .into_iter()
     167 +        .sorted_unstable_by_key(|f| f.1.first().and_then(|ff| ff.size).unwrap_or_default()) // sort by ascending size
     168 +        .for_each(|(hash, group)| {
     169 +            let mut inner_table = Table::new();
     170 +            inner_table.set_format(*format::consts::FORMAT_NO_BORDER_LINE_SEPARATOR);
     171 +            group.iter().for_each(|file| {
     172 +                inner_table.add_row(row![
     173 +                    format_path(&file.path, opts).unwrap_or_default().blue(),
     174 +                    file_size(&file).unwrap_or_default().red(),
     175 +                    modified_time(&file.path).unwrap_or_default().yellow()
     176 +                ]);
     177 +            });
     178 +            output_table.add_row(row![hash.green(), inner_table]);
    157 179          });
    158  -        output_table.add_row(row![hash.green(), inner_table]);
    159  -    });
    160 180   
    161 181      output_table.printstd();
    162 182  }
    skipped 1 lines
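    Both printers now order groups by the size of the first file in each group: ascending in `print`, descending (via a negated signed key) in `interactive`. Since the map iterator yields `(key, group)` tuples, `f.1` is the `Vec<File>`. A standalone sketch of the same itertools pattern on toy data (hypothetical values, not the crate's types):

    ```rust
    use itertools::Itertools;

    fn main() {
        // (hash, file sizes) pairs standing in for the (String, Vec<File>) entries.
        let groups = vec![
            ("aaa", vec![300u64, 300]),
            ("bbb", vec![100, 100]),
            ("ccc", vec![200, 200]),
        ];

        // Ascending by the first file's size, as in print().
        let ascending: Vec<_> = groups
            .iter()
            .sorted_unstable_by_key(|g| g.1.first().copied().unwrap_or_default())
            .map(|g| g.0)
            .collect();
        assert_eq!(ascending, ["bbb", "ccc", "aaa"]);

        // Descending, as in interactive(): negate the key as an i64.
        let descending: Vec<_> = groups
            .iter()
            .sorted_unstable_by_key(|g| -(g.1.first().copied().unwrap_or_default() as i64))
            .map(|g| g.0)
            .collect();
        assert_eq!(descending, ["aaa", "ccc", "bbb"]);
    }
    ```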
• src/params.rs
    1 1  use anyhow::{anyhow, Result};
    2  -use clap::Parser;
     2 +use clap::{Parser, ValueHint};
    3 3  use std::{fs, path::PathBuf};
    4 4   
    5 5  #[derive(Parser, Debug)]
    skipped 3 lines
    9 9      #[arg(short, long)]
    10 10      pub types: Option<String>,
    11 11      /// Run Deduplicator on dir different from pwd
    12  -    #[arg(long)]
     12 +    #[arg(long, value_hint = ValueHint::DirPath)]
    13 13      pub dir: Option<PathBuf>,
    14 14      /// Delete files interactively
    15 15      #[arg(long, short)]
    16 16      pub interactive: bool,
     17 +    /// Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T). [default = 0]
     18 +    #[arg(long, short)]
     19 +    pub minsize: Option<String>,
    17 20  }
    18 21   
    19 22  impl Params {
     23 +    pub fn get_minsize(&self) -> Option<u64> {
     24 +        match &self.minsize {
     25 +            Some(msize) => match msize.parse::<bytesize::ByteSize>() {
     26 +                Ok(units) => Some(units.0),
     27 +                Err(_) => None,
     28 +            },
     29 +            None => None,
     30 +        }
     31 +    }
     32 + 
    20 33      pub fn get_directory(&self) -> Result<String> {
    21 34          let dir_pathbuf: PathBuf = self
    22 35              .dir
    skipped 29 lines
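    `get_minsize` leans on two pieces of the `bytesize` API: `ByteSize` implements `FromStr`, and it is a tuple struct over a raw `u64` byte count, hence `units.0`. A quick sketch of the behavior the flag depends on (my reading of bytesize 1.1; the suffixes are decimal SI units, so `2M` should come out as 2,000,000 bytes):

    ```rust
    use bytesize::ByteSize;

    fn main() {
        // FromStr yields a ByteSize; its .0 field is the byte count get_minsize returns.
        match "2M".parse::<ByteSize>() {
            Ok(units) => println!("2M -> {} bytes", units.0),
            Err(err) => println!("parse failed: {err}"),
        }

        // Invalid input surfaces as Err, which get_minsize maps to None —
        // an unparsable --minsize silently disables the size filter.
        assert!("not-a-size".parse::<ByteSize>().is_err());
    }
    ```

    One consequence worth noting: because parse failures become `None`, a typo like `--minsize 1Q` behaves the same as omitting the flag.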
• src/scanner.rs
     1 +use crate::{file_manager::File, filters, params::Params};
    1 2  use anyhow::Result;
    2 3  use dashmap::DashMap;
    3 4  use fxhash::hash64 as hasher;
    skipped 3 lines
    7 8  use rayon::prelude::*;
    8 9  use std::hash::Hasher;
    9 10  use std::{fs, path::PathBuf};
    10  - 
    11  -use crate::{file_manager::File, params::Params};
    12 11   
    13 12  #[derive(Clone, Copy)]
    14 13  enum IndexCritera {
    skipped 13 lines
    28 27          .collect::<Vec<File>>();
    29 28   
    30 29      if sizewize_duplicate_files.len() > 1 {
    31  -        let size_wise_duplicate_paths = sizewize_duplicate_files
    32  -            .into_par_iter()
    33  -            .map(|file| file.path)
    34  -            .collect::<Vec<String>>();
    35  - 
    36  -        let hash_index_store = index_files(size_wise_duplicate_paths, IndexCritera::Hash)?;
     30 +        let hash_index_store = index_files(sizewize_duplicate_files, IndexCritera::Hash)?;
    37 31          let duplicate_files = hash_index_store
    38 32              .into_par_iter()
    39 33              .filter(|(_, files)| files.len() > 1)
    skipped 5 lines
    45 39      }
    46 40  }
    47 41   
    48  -fn scan(app_opts: &Params) -> Result<Vec<String>> {
     42 +fn scan(app_opts: &Params) -> Result<Vec<File>> {
    49 43      let glob_patterns: Vec<PathBuf> = app_opts.get_glob_patterns();
    50  -    let files: Vec<String> = glob_patterns
     44 +    let files: Vec<File> = glob_patterns
    51 45          .par_iter()
    52 46          .progress_with_style(ProgressStyle::with_template(
    53 47              "{spinner:.green} [scanning files] [{wide_bar:.cyan/blue}] {pos}/{len} files",
    skipped 9 lines
    63 57              })
    64 58              .collect::<Vec<String>>()
    65 59          })
     60 +        .map(|file_path| File {
     61 +            path: file_path.clone(),
     62 +            hash: None,
     63 +            size: Some(fs::metadata(file_path).unwrap().len()),
     64 +        })
     65 +        .filter(|file| filters::is_file_gt_minsize(app_opts, file))
    66 66          .collect();
    67 67   
    68 68      Ok(files)
    69 69  }
    70 70   
    71  -fn process_file_size_index(fpath: String) -> Result<File> {
     71 +fn process_file_hash_index(file: &File) -> Result<File> {
    72 72      Ok(File {
    73  -        path: fpath.clone(),
    74  -        size: Some(fs::metadata(fpath)?.len()),
    75  -        hash: None,
    76  -    })
    77  -}
    78  - 
    79  -fn process_file_hash_index(fpath: String) -> Result<File> {
    80  -    Ok(File {
    81  -        path: fpath.clone(),
    82  -        size: None,
    83  -        hash: Some(hash_file(&fpath).unwrap_or_default()),
     73 +        path: file.path.clone(),
     74 +        size: file.size,
     75 +        hash: Some(hash_file(&file.path).unwrap_or_default()),
    84 76      })
    85 77  }
    86 78   
    87 79  fn process_file_index(
    88  -    fpath: String,
     80 +    file: File,
    89 81      store: &DashMap<String, Vec<File>>,
    90 82      index_criteria: IndexCritera,
    91 83  ) {
    92 84      match index_criteria {
    93 85          IndexCritera::Size => {
    94  -            let processed_file = process_file_size_index(fpath).unwrap();
    95 86              store
    96  -                .entry(processed_file.size.unwrap_or_default().to_string())
    97  -                .and_modify(|fileset| fileset.push(processed_file.clone()))
    98  -                .or_insert_with(|| vec![processed_file]);
     87 +                .entry(file.size.unwrap_or_default().to_string())
     88 +                .and_modify(|fileset| fileset.push(file.clone()))
     89 +                .or_insert_with(|| vec![file]);
    99 90          }
    100 91          IndexCritera::Hash => {
    101  -            let processed_file = process_file_hash_index(fpath).unwrap();
     92 +            let processed_file = process_file_hash_index(&file).unwrap();
    102 93              let indexhash = processed_file.clone().hash.unwrap_or_default();
    103 94   
    104 95              store
    skipped 5 lines
    110 101  }
    111 102   
    112 103  fn index_files(
    113  -    files: Vec<String>,
     104 +    files: Vec<File>,
    114 105      index_criteria: IndexCritera,
    115 106  ) -> Result<DashMap<String, Vec<File>>> {
    116 107      let store: DashMap<String, Vec<File>> = DashMap::new();
    skipped 7 lines
    124 115   Ok(store)
    125 116  }
    126 117   
    127  -pub fn incremental_hashing(filepath: &str) -> Result<String> {
     118 +fn incremental_hashing(filepath: &str) -> Result<String> {
    128 119      let file = fs::File::open(filepath)?;
    129 120      let fmap = unsafe { Mmap::map(&file)? };
    130 121      let mut inchasher = fxhash::FxHasher::default();
    skipped 4 lines
    135 126      Ok(format!("{}", inchasher.finish()))
    136 127  }
    137 128   
    138  -pub fn standard_hashing(filepath: &str) -> Result<String> {
     129 +fn standard_hashing(filepath: &str) -> Result<String> {
    139 130   let file = fs::read(filepath)?;
    140 131   Ok(hasher(&*file).to_string())
    141 132  }
    142 133   
    143  -pub fn hash_file(filepath: &str) -> Result<String> {
     134 +fn hash_file(filepath: &str) -> Result<String> {
    144 135      let filemeta = fs::metadata(filepath)?;
    145 136   
    146 137      // NOTE: USE INCREMENTAL HASHING ONLY FOR FILES > 100MB
    skipped 6 lines
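    Taken together, the scanner still runs its two-stage pipeline (group by size, then hash only the size collisions), but both stages now traffic in `File` values, so the size read once during the scan is carried through to the hash stage and the output tables instead of being re-stat'ed per file. A condensed, single-threaded sketch of that strategy with simplified types (plain `HashMap` instead of `DashMap`, no rayon, toy hash function; not the crate's actual code):

    ```rust
    use std::collections::HashMap;

    #[derive(Clone)]
    struct File {
        path: String,
        size: u64,
        hash: Option<String>,
    }

    fn find_duplicates(files: Vec<File>, hash: impl Fn(&str) -> String) -> Vec<Vec<File>> {
        // Stage 1: bucket by size; a file with a unique size cannot be a duplicate.
        let mut by_size: HashMap<u64, Vec<File>> = HashMap::new();
        for f in files {
            by_size.entry(f.size).or_default().push(f);
        }

        // Stage 2: hash only the size-collision survivors, then bucket by hash.
        let mut by_hash: HashMap<String, Vec<File>> = HashMap::new();
        for f in by_size.into_values().filter(|g| g.len() > 1).flatten() {
            let h = hash(&f.path);
            by_hash.entry(h.clone()).or_default().push(File { hash: Some(h), ..f });
        }

        by_hash.into_values().filter(|g| g.len() > 1).collect()
    }

    fn main() {
        let files = vec![
            File { path: "a".into(), size: 10, hash: None },
            File { path: "b".into(), size: 10, hash: None },
            File { path: "c".into(), size: 20, hash: None },
        ];
        // Toy hash: the size-10 pair collides; the size-20 file is never hashed at all.
        let dups = find_duplicates(files, |_| "same".into());
        assert_eq!(dups.len(), 1);
        assert_eq!(dups[0].len(), 2);
    }
    ```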