Updated readme.
[md5sumpare.git] / main.c
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
5
/*
 * Return the size in bytes of the file named by `filename`.
 *
 * Returns -1 if stat() fails, or if the reported size cannot be represented
 * in the int return type (st_size is an off_t, which may be wider than int;
 * returning it unchecked would silently truncate large files).
 */
int fsize(const char *filename) {
    struct stat st;

    if (stat(filename, &st) != 0)
        return -1;

    /* Reject sizes that do not fit in an int instead of truncating them. */
    if (st.st_size < 0 || st.st_size > INT_MAX)
        return -1;

    return (int)st.st_size;
}
14
/*
 * Report lines of FILE whose leading 32 bytes are identical.
 *
 * Usage: <program> FILE
 *
 * The whole file is read into memory and split on '\n'.  Every line is
 * assigned to one of 65536 buckets, keyed by the hexadecimal value of its
 * first four characters, and only lines within the same bucket are compared
 * with each other.  Lines whose first 32 bytes match are printed as
 * identical.  (Presumably FILE is md5sum-style output where those 32
 * characters are the digest; the code itself only relies on the prefix.)
 *
 * Bytes following the last '\n' are not treated as a line.
 *
 * Returns 0 on success and 1 on any error.
 */
int main(int argc, char** argv) {
        enum {
                NUM_BUCKETS = 0xffff + 1, /* one bucket per 4-hex-digit key */
                KEY_CHARS = 4,            /* leading characters used as bucket key */
                HASH_CHARS = 32           /* leading bytes compared between lines */
        };

        struct breaks {
                int pos;     /* offset of the line's first byte within data */
                int cmplen;  /* bytes taking part in comparison: min(length, 32) */
                int buckidx; /* bucket this line was assigned to */
                char duped;  /* non-zero once reported as a duplicate */
        };

        /*
         * Static storage: zero-initialised, and keeps these two large tables
         * off the stack.
         */
        static int numbuckets[NUM_BUCKETS];
        static struct breaks **bbuck[NUM_BUCKETS];

        FILE *fp = NULL;
        char *fn;
        char *data = NULL;
        struct breaks *linebreaks = NULL;
        int size;
        int numLines = 0;
        int lastbreak = 0;
        int usedbuckets = 0;
        int usedplaces = 0;
        int numduped = 0;
        int numcomps = 0;
        int c, idx, cc;
        int rc = 1;

        if (argc != 2) {
                puts("Required argument: FILE\n");
                return 1;
        }
        fn = argv[1];
        printf("Attempting to open %s...\n", fn);
        size = fsize(fn);

        if (size < 0) {
                puts("Could not get file size.\n");
                return 1;
        }

        printf("File is %i bytes...\n", size);

        /* Binary mode: the byte count from fsize() must match what is read. */
        fp = fopen(fn, "rb");
        if (!fp) {
                puts("Could not open the file..\n");
                return 1;
        }

        /* Always request at least one byte so an empty file is not an error. */
        data = malloc(size > 0 ? (size_t)size : 1);
        if (!data) {
                puts("Could not allocate memory for the file...\n");
                goto cleanup;
        }

        if (fread(data, 1, (size_t)size, fp) != (size_t)size) {
                puts("Could not read data..\n");
                goto cleanup;
        }

        printf("Read %i bytes.\n", size);

        for (c = 0; c < size; c++) {
                if (data[c] == '\n')
                        numLines++;
        }

        printf("There are %i linebreaks.\n", numLines);

        linebreaks = calloc(numLines > 0 ? (size_t)numLines : 1, sizeof *linebreaks);
        if (!linebreaks) {
                puts("Could not allocate memory for the line index...\n");
                goto cleanup;
        }

        /* Pass 1: record every line, pick its bucket and count bucket sizes. */
        idx = 0;
        for (c = 0; c < size; c++) {
                struct breaks *line;
                char key[KEY_CHARS + 1];
                int linelen;
                int keylen;
                long bucket;

                if (data[c] != '\n')
                        continue;

                line = &linebreaks[idx++];
                linelen = c - lastbreak;
                line->pos = lastbreak;
                line->cmplen = linelen < HASH_CHARS ? linelen : HASH_CHARS;

                /*
                 * Parse the key from a private copy limited to the line itself,
                 * so short lines never touch bytes outside the line or buffer.
                 */
                keylen = linelen < KEY_CHARS ? linelen : KEY_CHARS;
                memcpy(key, data + lastbreak, (size_t)keylen);
                key[keylen] = '\0';
                bucket = strtol(key, NULL, 16);
                /* strtol accepts a sign; keep the index inside the table. */
                if (bucket < 0 || bucket >= NUM_BUCKETS)
                        bucket = 0;
                line->buckidx = (int)bucket;
                numbuckets[line->buckidx]++;

                /* Terminate the line in place so it can be printed with puts(). */
                data[c] = '\0';
                lastbreak = c + 1;
        }

        for (c = 0; c < NUM_BUCKETS; c++) {
                if (!numbuckets[c])
                        continue;
                bbuck[c] = malloc(sizeof *bbuck[c] * (size_t)numbuckets[c]);
                if (!bbuck[c]) {
                        puts("Could not allocate memory for the buckets...\n");
                        goto cleanup;
                }
                usedbuckets++;
                usedplaces += numbuckets[c];
        }

        printf("Allocated %i places in %i buckets..\n",usedplaces, usedbuckets);

        /* Pass 2: reuse the counters as fill positions while filling buckets. */
        memset(numbuckets, 0, sizeof numbuckets);

        for (c = 0; c < numLines; c++) {
                cc = linebreaks[c].buckidx;
                bbuck[cc][numbuckets[cc]++] = &linebreaks[c];
        }

        puts("Comparing...");

        for (c = 0; c < NUM_BUCKETS; c++) {
                for (idx = 0; idx < numbuckets[c]; idx++) {
                        struct breaks *first = bbuck[c][idx];

                        /* Already printed as part of an earlier group. */
                        if (first->duped)
                                continue;

                        for (cc = idx + 1; cc < numbuckets[c]; cc++) {
                                struct breaks *other = bbuck[c][cc];

                                numcomps++;
                                /* Compare only bytes that belong to both lines. */
                                if (first->cmplen != other->cmplen ||
                                    memcmp(data + first->pos, data + other->pos,
                                           (size_t)first->cmplen) != 0)
                                        continue;

                                if (!first->duped) {
                                        first->duped = 1;
                                        puts("-------------- Identical Files Found ----------------");
                                        puts(data + first->pos);
                                        numduped++;
                                }
                                puts(data + other->pos);
                                other->duped = 1;
                                numduped++;
                        }
                }
        }
        puts("------------------- Comparison Done  ----------------");
        printf("Did %i comparisons.\nFound %i identical.\n", numcomps, numduped);

        rc = 0;

cleanup:
        /* Unallocated buckets are still NULL, and free(NULL) is a no-op. */
        for (c = 0; c < NUM_BUCKETS; c++)
                free(bbuck[c]);
        free(linebreaks);
        free(data);
        fclose(fp);
        return rc;
}