Skip to content

Commit 302ef72

Browse files
author
Unbewohnte
committed
broom can now remove duplicates and create symlinks; updated README; moved the entry groups enum into the entry.hpp file; broom seems to have become quite a useful tool!
1 parent f084552 commit 302ef72

File tree

8 files changed

+76
-63
lines changed

8 files changed

+76
-63
lines changed

README.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ broom [FLAGS..] [COMMAND] [DIRECTORY]
5757

5858
[COMMANDS]
5959

60-
- `sweep` -> scan for duplicate files, save results in a file and REMOVE empty files
60+
- `sweep` -> scan for duplicate files, REMOVE empty files and REPLACE other duplicates with symlinks
6161
- `scan` -> scan and save results in a file without removing anything [DEFAULT]
6262

6363

@@ -66,9 +66,9 @@ broom [FLAGS..] [COMMAND] [DIRECTORY]
6666
### Examples
6767

6868
- `broom scan -od . ~/homework`
69-
- `broom sweep ~/homework/I/have/a/lot/of/empty/files/here/for/some/reason`
69+
- `broom sweep ~/homework`
7070

71-
after the scan the results file will be saved in your current working directory, scan results file contains
71+
After the scan, the results file is saved in your current working directory unless you specified a different output directory with `-od`. The scan results file contains
7272
a list of duplicate files that are grouped together so you can see EXACTLY WHERE each duplicate is in the filesystem.
7373

7474
---
@@ -80,4 +80,5 @@ GPLv3
8080

8181
## TODO
8282
- Make it go `P` A `R` A `L` L `E` L
83-
- Output approximate size that could be freed
83+
- ~~Output approximate size that could be freed~~
84+
- ~~Remove duplicates and create symlinks~~

build/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,5 @@ endif()
2222

2323
set(EXECUTABLE_OUTPUT_PATH ../bin)
2424

25-
add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp ../src/group.hpp)
25+
add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp)
2626
target_link_libraries(broom Threads::Threads)

src/broom.cpp

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
2828

2929
#include "entry.hpp"
3030
#include "broom.hpp"
31-
#include "group.hpp"
3231

3332
namespace broom {
3433

@@ -52,15 +51,15 @@ std::vector<entry::Entry> Broom::track(const std::filesystem::path path) {
5251
);
5352

5453
for (auto dir_entry : std::filesystem::recursive_directory_iterator(path, options)) {
55-
if (!dir_entry.is_regular_file()) {
54+
if (!dir_entry.is_regular_file() || std::filesystem::is_symlink(dir_entry.path())) {
5655
// skip everything that we cannot process so easily
5756
continue;
5857
};
5958

6059
entry::Entry entry(dir_entry.path());
6160
tracked_entries.push_back(entry);
6261
}
63-
} else if (std::filesystem::is_regular_file(path)) {
62+
} else if (std::filesystem::is_regular_file(path) && !std::filesystem::is_symlink(path)) {
6463
// just a file
6564
entry::Entry entry(path);
6665
tracked_entries.push_back(entry);
@@ -186,7 +185,7 @@ uintmax_t Broom::find_empty_files(std::vector<entry::Entry>& tracked_entries) {
186185
for (entry::Entry& entry : tracked_entries) {
187186
if (entry.filesize == 0) {
188187
// empty files can't be considered duplicates of each other; assign them to the EMPTY group
189-
entry.group = group::EMPTY;
188+
entry.group = entry::EMPTY;
190189
found_empty_files++;
191190
}
192191
}
@@ -199,7 +198,7 @@ uintmax_t Broom::remove_empty_files(std::vector<entry::Entry>& tracked_entries)
199198
uintmax_t removed = 0;
200199

201200
tracked_entries.erase(std::remove_if(tracked_entries.begin(), tracked_entries.end(), [&removed](entry::Entry& entry) -> bool {
202-
if (entry.group == group::EMPTY) {
201+
if (entry.group == entry::EMPTY) {
203202
try {
204203
entry.remove();
205204
removed++;
@@ -219,11 +218,11 @@ uintmax_t Broom::remove_empty_files(std::vector<entry::Entry>& tracked_entries)
219218
// marks every entry without any group as a duplicate
220219
void Broom::mark_as_duplicates(std::vector<entry::Entry>& tracked_entries) {
221220
for (entry::Entry& entry : tracked_entries) {
222-
if (entry.group == group::EMPTY) {
221+
if (entry.group == entry::EMPTY) {
223222
// do not mess up grouping
224223
continue;
225224
}
226-
entry.group = group::DUPLICATE;
225+
entry.group = entry::DUPLICATE;
227226
}
228227
};
229228

@@ -252,4 +251,33 @@ std::map<std::string, std::vector<entry::Entry>> Broom::group_duplicates(std::ve
252251
return duplicate_groups;
253252
};
254253

254+
// REMOVES every duplicate file in a group except the first one and creates symlinks pointing to the
255+
// first remaining real file
256+
void Broom::remove_duplicates_make_symlinks(const std::map<std::string, std::vector<entry::Entry>> grouped_duplicates) {
257+
for (const auto& record : grouped_duplicates) {
258+
unsigned int i = 0;
259+
std::filesystem::path original_file_path;
260+
261+
for (const auto& duplicate_entry : record.second) {
262+
if (i == 0) {
263+
// the first duplicate in the group. Save it
264+
original_file_path = duplicate_entry.path;
265+
} else {
266+
// not the first entry; REMOVE it and create a symlink,
267+
// pointing to the real file
268+
std::filesystem::path removed_duplicate_path = duplicate_entry.path;
269+
try {
270+
// remove the entry
271+
duplicate_entry.remove();
272+
// make a symlink
273+
std::filesystem::create_symlink(original_file_path, removed_duplicate_path);
274+
} catch(...) {}
275+
}
276+
277+
// the counter only distinguishes the first iteration from the rest; its exact value after that doesn't matter
278+
i++;
279+
}
280+
}
281+
};
282+
255283
}

src/broom.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ class Broom {
6161
// string of pieces. REMOVES EVERYTHING FROM GIVEN TRACKED ENTRIES
6262
std::map<std::string, std::vector<entry::Entry>> group_duplicates(std::vector<entry::Entry>& tracked_entries);
6363

64+
// REMOVES every duplicate file in a group except the first one and creates symlinks pointing to the
65+
// first remaining real file
66+
void remove_duplicates_make_symlinks(const std::map<std::string, std::vector<entry::Entry>> grouped_duplicates);
67+
6468
// creates a list of duplicate, empty files and puts it into a file
6569
void create_scan_results_list(const std::map<std::string, std::vector<entry::Entry>> grouped_duplicates, const std::filesystem::path dir = ".", const std::string filename = "scan_results.txt");
6670
};

src/entry.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ void Entry::get_pieces() {
9595
};
9696

9797
// Remove entry from the disk
98-
void Entry::remove() {
98+
void Entry::remove() const {
9999
std::filesystem::remove(path);
100100
};
101101

src/entry.hpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,14 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
2626
#include <iomanip>
2727
#include <string>
2828

29-
#include "group.hpp"
30-
3129

3230
namespace entry {
31+
32+
enum Group {
33+
DUPLICATE,
34+
EMPTY,
35+
};
36+
3337
// 3 pieces (beginning, middle and end of the file)
3438
const uint8_t PIECE_SIZE = 75;
3539
const uint8_t PIECES_AMOUNT = 3;
@@ -40,7 +44,7 @@ class Entry {
4044
std::filesystem::path path; // set via constructor
4145
uintmax_t filesize; // set via constructor
4246
std::string pieces; // 3 hex-represented pieces of file; set only via a method call to not stress the disk
43-
group::Group group; // set externally
47+
Group group; // set externally
4448

4549
Entry(const std::filesystem::path entry_path);
4650
~Entry();
@@ -51,7 +55,7 @@ class Entry {
5155
void get_pieces();
5256

5357
// REMOVE entry from the disk
54-
void remove();
58+
void remove() const;
5559
};
5660

5761
}

src/group.hpp

Lines changed: 0 additions & 33 deletions
This file was deleted.

src/main.cpp

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
2828
#include "broom.hpp"
2929

3030
// Broom version number
31-
#define VERSION "v0.2.3"
31+
#define VERSION "v0.3.0"
3232

3333
void print_help() {
3434
std::cout
@@ -39,7 +39,7 @@ void print_help() {
3939
<< "-od | --output-directory -> path to the directory to save results file in" << std::endl << std::endl
4040

4141
<< "[COMMANDS]" << std::endl
42-
<< "sweep -> scan for duplicate files, save results in a file and REMOVE empty files" << std::endl
42+
<< "sweep -> scan for duplicate files, REMOVE empty files and REPLACE other duplicates with symlinks" << std::endl
4343
<< "scan -> scan and save results in a file without removing anything [DEFAULT]" << std::endl << std::endl
4444

4545
<< "[DIRECTORY]" << std::endl
@@ -170,26 +170,35 @@ int main(int argc, char* argv[]) {
170170

171171
std::cout << "[INFO] " << tracked_entries.size() << " files left being tracked" << std::endl;
172172

173-
auto grouped_duplicates = broom.group_duplicates(tracked_entries);
174-
175-
if (grouped_duplicates.size() == 0) {
173+
if (tracked_entries.size() == 0) {
174+
// No duplicates at all !
176175
std::cout << "[INFO] Nothing I can help with ! Congratulations !" << std::endl;
177176
return 0;
178177
}
179178

180-
// now only files with a non-unique size and contents are being tracked
181-
// are they REALLY duplicates ?
182-
// better to leave the REALL cleanup for the user, saving these entries in a file, than doing a blind and possibly destructive purge
183-
broom.create_scan_results_list(grouped_duplicates, results_file_dir_path);
184-
std::cout << "[INFO] Created scan results file" << std::endl;
179+
// sort the remaining tracked entries into groups of duplicates
180+
auto grouped_duplicates = broom.group_duplicates(tracked_entries);
185181

186-
// output a little information about how much space could be freed if every duplicate
187-
// in the group will be deleted but one
188182
double could_be_freed = 0;
189183
for (auto& record : grouped_duplicates) {
190184
could_be_freed += record.second[0].filesize * (record.second.size() - 1);
191185
}
192-
std::cout <<"[INFO] " << could_be_freed / 1024 / 1024 << " MB could be freed" << std::endl;
186+
187+
if (!sweeping) {
188+
// output a little information about how much space could be freed if every duplicate
189+
// in each group except one were deleted
190+
std::cout <<"[INFO] " << could_be_freed / 1024 / 1024 << " MB could be freed" << std::endl;
191+
192+
broom.create_scan_results_list(grouped_duplicates, results_file_dir_path);
193+
std::cout << "[INFO] Created scan results file" << std::endl;
194+
195+
} else {
196+
// remove duplicates and create symlinks
197+
std::cout << "[INFO] Removing duplicates and creating symlinks..." << std::endl;
198+
broom.remove_duplicates_make_symlinks(grouped_duplicates);
199+
200+
std::cout <<"[INFO] Freed approximately " << could_be_freed / 1024 / 1024 << " MB (May be incorrect)" << std::endl;
201+
}
193202

194203
} catch(const std::exception& e) {
195204
std::cerr

0 commit comments

Comments
 (0)