Initial (and likely only) commit
author dusted <dusted@asuka.dusted.dk>
Wed, 7 Sep 2016 23:14:58 +0000 (01:14 +0200)
committer dusted <dusted@asuka.dusted.dk>
Wed, 7 Sep 2016 23:24:22 +0000 (01:24 +0200)
README [new file with mode: 0644]
main.c [new file with mode: 0644]

diff --git a/README b/README
new file mode 100644 (file)
index 0000000..8871d7c
--- /dev/null
+++ b/README
@@ -0,0 +1,50 @@
+Scan quickly through a file of checksums (one per line, 32 bytes each) and identify identical checksums.
+This program will allocate memory for the entire file, so it will fail or perform poorly on files larger than available memory.
+
+A list of checksums can be generated with: find -type f -exec md5sum {} \; > data.txt
+
+Program can be built with:
+  gcc main.c -o comparesums
+
+Program can be run with:
+  ./comparesums data.txt
+
+Example input:
+ 7ca6adad45f5fc6a668be5c0e8c6c576  main.c
+ d5ca4c947a77efaa6206e81b1f774cbc  a.out
+ bf247132afd50625c7eaa7c132ec91b8  README
+ 7ca6adad45f5fc6a668be5c0e8c6c576  test.c
+
+Example output:
+ Attempting to open fil.txt...
+ File is 163 bytes...
+ Read 163 bytes.
+ There are 4 linebreaks.
+ Allocated 4 places in 3 buckets..
+ Comparing...
+ -------------- Identical Files Found ----------------
+ 7ca6adad45f5fc6a668be5c0e8c6c576  main.c
+ 7ca6adad45f5fc6a668be5c0e8c6c576  test.c
+ ------------------- Comparison Done  ----------------
+ Did 1 comparisons.
+ Found 2 identical.
+
+Copyleft: WTFPL
+
+For a more serious tool, check "beyondcompare" or something else; this just fit what I wanted to do and was a fun way to spend a holiday evening hour.
+
+Performance is reasonable (1.5 GHz machine):
+ Attempting to open data.txt...
+ File is 34748410 bytes...
+ Read 34748410 bytes.
+ There are 249119 linebreaks.
+ Allocated 249119 places in 62983 buckets..
+ Comparing...
+ [SNIPPED]
+ ------------------- Comparison Done  ----------------
+ Did 439760 comparisons.
+ Found 56233 identical.
+
+ real    0m0.371s
+ user    0m0.324s
+ sys     0m0.044s
diff --git a/main.c b/main.c
new file mode 100644 (file)
index 0000000..268dbe1
--- /dev/null
+++ b/main.c
@@ -0,0 +1,165 @@
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+
+/**
+ * Return the size of `filename` in bytes, or -1 on failure.
+ *
+ * Failure means either that stat() could not examine the path, or that the
+ * size does not fit in the int return type. st_size is an off_t and may be
+ * wider than int; returning it unchecked would silently truncate the size
+ * of very large files, so such files are reported as an error instead.
+ */
+int fsize(const char *filename) {
+    struct stat st;
+
+    if (stat(filename, &st) != 0)
+        return -1;
+
+    /* Refuse sizes the caller could not represent rather than truncating. */
+    if (st.st_size < 0 || st.st_size > INT_MAX)
+        return -1;
+
+    return (int)st.st_size;
+}
+
+enum {
+    NUM_BUCKETS  = 0xffff + 1, /* one bucket per value of the leading 4 hex digits */
+    PREFIX_LEN   = 4,          /* number of leading hex digits used to pick a bucket */
+    CHECKSUM_LEN = 32          /* bytes compared to decide that two lines are identical */
+};
+
+/* One checksum line inside the in-memory copy of the file. */
+struct line_entry {
+    int pos;     /* offset of the line's first byte in the buffer */
+    int buckidx; /* bucket chosen from the line's leading hex digits */
+    char duped;  /* non-zero once the line has been printed as a duplicate */
+};
+
+/* Value of hex digit `c`, or -1 when `c` is not a hex digit. */
+static int hex_value(char c) {
+    if (c >= '0' && c <= '9')
+        return c - '0';
+    if (c >= 'a' && c <= 'f')
+        return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F')
+        return c - 'A' + 10;
+    return -1;
+}
+
+/*
+ * Bucket for `line`: the value of its leading hex digits (at most PREFIX_LEN),
+ * stopping at the first non-hex character. The result is always within
+ * [0, NUM_BUCKETS), unlike strtol(), which also accepts a sign and could
+ * therefore yield a negative array index. `line` must hold at least
+ * PREFIX_LEN readable bytes.
+ */
+static int bucket_index(const char *line) {
+    int value = 0;
+
+    for (int i = 0; i < PREFIX_LEN; i++) {
+        int digit = hex_value(line[i]);
+        if (digit < 0)
+            break;
+        value = value * 16 + digit;
+    }
+    return value;
+}
+
+/**
+ * Read the checksum list named by argv[1] into memory, group its lines into
+ * buckets keyed by the first PREFIX_LEN hex digits, and print every group of
+ * lines whose first CHECKSUM_LEN bytes are identical.
+ *
+ * Lines shorter than CHECKSUM_LEN bytes cannot hold a checksum and are
+ * skipped. A final line without a trailing newline is processed like any
+ * other line.
+ *
+ * Returns 0 on success and 1 on a usage, I/O or allocation error.
+ */
+int main(int argc, char **argv) {
+    int rc = 1;
+    FILE *fp = NULL;
+    char *data = NULL;
+    struct line_entry *lines = NULL;  /* entries in file order */
+    struct line_entry *sorted = NULL; /* the same entries grouped by bucket */
+    int *counts = NULL;               /* number of entries per bucket */
+    int *next = NULL;                 /* next free slot per bucket in `sorted` */
+
+    if (argc != 2) {
+        puts("Required argument: FILE\n");
+        return 1;
+    }
+
+    char *fn = argv[1];
+    printf("Attempting to open %s...\n", fn);
+
+    int size = fsize(fn);
+    if (size < 0) {
+        puts("Could not get file size.\n");
+        goto out;
+    }
+    printf("File is %i bytes...\n", size);
+
+    fp = fopen(fn, "rb");
+    if (!fp) {
+        puts("Could not open the file..\n");
+        goto out;
+    }
+
+    /* One extra byte so the last line is NUL-terminated even when the file
+     * does not end with a newline. */
+    size_t total = (size_t)size;
+    data = malloc(total + 1);
+    if (!data) {
+        puts("Could not allocate memory for the file...\n");
+        goto out;
+    }
+
+    /* Byte-wise read so that an empty file is not mistaken for a failure. */
+    if (fread(data, 1, total, fp) != total) {
+        puts("Could not read data..\n");
+        goto out;
+    }
+    data[total] = '\0';
+    printf("Read %i bytes.\n", size);
+
+    int numLines = 0;
+    for (size_t c = 0; c < total; c++) {
+        if (data[c] == '\n')
+            numLines++;
+    }
+    printf("There are %i linebreaks.\n", numLines);
+
+    /* At most one entry per newline plus one for an unterminated last line. */
+    size_t maxEntries = (size_t)numLines + 1;
+    lines = calloc(maxEntries, sizeof *lines);
+    sorted = calloc(maxEntries, sizeof *sorted);
+    counts = calloc(NUM_BUCKETS, sizeof *counts);
+    next = calloc(NUM_BUCKETS, sizeof *next);
+    if (!lines || !sorted || !counts || !next) {
+        puts("Could not allocate memory for the line index...\n");
+        goto out;
+    }
+
+    /* Pass 1: split the buffer into NUL-terminated lines and count how many
+     * fall into each bucket. c == total handles the end of the buffer. */
+    int numEntries = 0;
+    size_t lineStart = 0;
+    for (size_t c = 0; c <= total; c++) {
+        if (c < total && data[c] != '\n')
+            continue;
+
+        data[c] = '\0';
+        if (c - lineStart >= (size_t)CHECKSUM_LEN) {
+            struct line_entry *entry = &lines[numEntries++];
+            entry->pos = (int)lineStart;
+            entry->buckidx = bucket_index(data + lineStart);
+            counts[entry->buckidx]++;
+        }
+        lineStart = c + 1;
+    }
+
+    /* Turn the counts into start offsets: bucket b occupies a contiguous
+     * run of counts[b] slots in `sorted`. */
+    int usedbuckets = 0;
+    int offset = 0;
+    for (int b = 0; b < NUM_BUCKETS; b++) {
+        next[b] = offset;
+        offset += counts[b];
+        if (counts[b])
+            usedbuckets++;
+    }
+    printf("Allocated %i places in %i buckets..\n", numEntries, usedbuckets);
+
+    /* Pass 2: place each entry into its bucket, keeping file order within a
+     * bucket. Afterwards next[b] is one past the last slot of bucket b. */
+    for (int i = 0; i < numEntries; i++)
+        sorted[next[lines[i].buckidx]++] = lines[i];
+
+    puts("Comparing...");
+    int numduped = 0;
+    int numcomps = 0;
+
+    for (int b = 0; b < NUM_BUCKETS; b++) {
+        struct line_entry *bucket = sorted + (next[b] - counts[b]);
+
+        for (int i = 0; i < counts[b]; i++) {
+            /* Already printed as part of an earlier group. */
+            if (bucket[i].duped)
+                continue;
+
+            const char *str = data + bucket[i].pos;
+            for (int j = i + 1; j < counts[b]; j++) {
+                numcomps++;
+                if (memcmp(str, data + bucket[j].pos, CHECKSUM_LEN) != 0)
+                    continue;
+
+                /* First match for this line: print the group header and
+                 * the line itself exactly once. */
+                if (!bucket[i].duped) {
+                    bucket[i].duped = 1;
+                    puts("-------------- Identical Files Found ----------------");
+                    puts(str);
+                    numduped++;
+                }
+                puts(data + bucket[j].pos);
+                bucket[j].duped = 1;
+                numduped++;
+            }
+        }
+    }
+
+    puts("------------------- Comparison Done  ----------------");
+    printf("Did %i comparisons.\nFound %i identical.\n", numcomps, numduped);
+    rc = 0;
+
+out:
+    free(next);
+    free(counts);
+    free(sorted);
+    free(lines);
+    free(data);
+    if (fp)
+        fclose(fp);
+    return rc;
+}