Articles   Members Online: 3
-Article/Tip Search
-News Group Search over 21 Million news group articles.
-Delphi/Pascal
-CBuilder/C++
-C#Builder/C#
-JBuilder/Java
-Kylix
Member Area
-Home
-Account Center
-Top 10 NEW!!
-Submit Article/Tip
-Forums Upgraded!!
-My Articles
-Edit Information
-Login/Logout
-Become a Member
-Why sign up!
-Newsletter
-Chat Online!
-Indexes NEW!!
Employment
-Build your resume
-Find a job
-Post a job
-Resume Search
Contacts
-Contacts
-Feedbacks
-Link to us
-Privacy/Disclaimer
Embarcadero
Visit Embarcadero
Embarcadero Community
JEDI
Links
How to search for duplicate files on large drives Turn on/off line numbers in source code. Switch to Orginial background IDE or DSP color Comment or reply to this aritlce/tip for discussion. Bookmark this article to my favorite article(s). Print this article
21-Dec-02
Category
Files Operation
Language
Delphi 2.x
Views
164
User Rating
No Votes
# Votes
0
Replies
0
Publisher:
DSP, Administrator
Reference URL:
DKB
			Author: Tomas Rutkauskas

I have a list of items that I will need to deal with (looking for duplicate files 
on a drive) that can be upto 2 to 5 million in size. So I will be populating this 
list and then searching it over and over and over. Normally when dealing with 
smaller lists like this I would simply use a TStringList and attach an object. 
However, this seems a little large for TStringList and the main reason, the 
searching with IndexOf, I don't think is reason enough to use it. So what I am 
looking for is a list of some sort (TObjectList ?) that is fast and good to deal 
with and can easily handle this size of entries. It would be really nice it there 
was a way to create multiple indexes into the in-memory list, as that would greatly 
speed up my processing of the information. The best solution would be an in-memory 
database of some sort (at least I think it would be), but my issue with in-memory 
databases is that of the String sizes for File Name and directory. If I use a 
regular String variable in an object, then the size can be variable. If I use a 
standard DB field, then the size in the ones I have seen are all static. So I have 
to define a huge field to handle all file names which wastes space on all other 
entries in the list. Any thoughts as the best container to handle this sort of 
thing?

Answer:
1   
2   {$A+,B-,D-,E+,F-,G+,I-,L-,N-,O-,P-,Q-,R-,S-,T-,V-,X-,Y-}
3   
4   program FindDup;
5   
6   {$APPTYPE CONSOLE} // 2002-12-16 - Delphi 32 console app
7   
8   {Usage: FINDDUP [D:]
9   
10  This program finds all duplicate files on the specified drive. The algorithm uses a 
11  hash table where each hash value contains a linked list of files that match a given 
12  hash. Every file found is inserted into the hash table linked list unless it's 
13  already there and therefore a duplicate. All duplicates are added to a red-black 
14  tree, where each node contains the file ame and a pointer to a linked list of 
15  records containing size, date, and path information.
16  
17  This program can be compiled in either real or protected modes. Protected mode will 
18  allow it to search larger drives. Informal benchmarks on a 1.6 Gig drive with > 900 
19  directories, Pentium 90, compiled for p-mode:
20  
21  TEST 16 sec. (simple do-nothing recursive dir searcher)
22  FINDDUP 18 sec. - 16 = 2 sec.
23  REP-DPMI 36 sec. - 16 = 20 sec.
24  
25  Subtracting the overhead of FindFirst/ FindNext, this program runs in about 2 
26  seconds as compared to REP-DPMI's 20 seconds. Without ignoring the overhead, it's 
27  still twice as fast. Please note that this program is pure pascal.}
28  
29  {$DEFINE DRBOB} // Do not search hidden directories or find hidden files
30  {.$DEFINE SAFE}// Checks memory allocations for out of memory conditions
31  {.$DEFINE CLEANUP}// Frees allocated memory - slows things down a bit
32  
33  uses
34    SysUtils;
35  
36  const
37    DosDelimSet: set of Char = ['\', ':', #0];
38    MaxHash = 16381; {largest prime number < 16K}
39  
40  type
41    PathStr = string;
42    NameString = string[12];
43    St2 = string[2];
44  
45    pPath = ^PathRecord; {we will only keep one copy of each unique directory path}
46    PathRecord = record
47      Next: pPath;
48      Path: PathStr;
49    end;
50  
51    pDataRec = ^DataRec; {detailed information unique to each dupliacte}
52    DataRec = record
53      Time: longint;
54      Size: longint;
55      Path: pPath;
56      Next: pDataRec;
57    end;
58  
59    link = ^RBTreeNode;
60    RBTreeNode = record {Red/ black tree node}
61      Key: NameString; {Name of file}
62      red: boolean;
63      l, r: link;
64      DataP: pDataRec; {linked list of detail information}
65    end;
66  
67    pFileRec = ^MyFileRec;
68    MyFileRec = record
69      {Hash table record. There will only be one record for each duplicate file name}
70      Name: NameString;
71      Time: longint;
72      Size: longint;
73      Path: pPath;
74      Node: link; {let's quickly insert duplicate into tree}
75      next: pFileRec; {next record in linked list}
76    end;
77  
78    tHashTable = array[0..MAXHASH] of pFileRec;
79  
80  var
81    Head, z: link;
82    HashTable: ^tHashTable;
83    PathHead, TempPathHead: pPath;
84    OldName: NameString;
85  
86  procedure RBTreeInitialize;
87  {Initialize red/ black tree}
88  begin
89    New(Z);
90  {$IFDEF SAFE}
91    if Z = nil then
92      exit;
93  {$ENDIF}
94    z^.l := z;
95    z^.r := z;
96    z^.red := false;
97    New(Head);
98  {$IFDEF SAFE}
99    if Head = nil then
100     exit;
101 {$ENDIF}
102   Head^.r := z;
103   Head^.l := z;
104   Head^.Key := '';
105   Head^.Red := false;
106 end;
107 
108 function Rotate(const Value: MyFileRec; y: link): link;
109 var
110   c, gc: link;
111 begin
112   if Value.Name < Y^.Key then
113     c := y^.l
114   else
115     c := y^.r;
116   if Value.Name < c^.Key then
117   begin
118     gc := c^.l;
119     c^.l := gc^.r;
120     gc^.r := c;
121   end
122   else
123   begin
124     gc := c^.r;
125     c^.r := gc^.l;
126     gc^.l := c;
127   end;
128   if Value.Name < Y^.Key then
129     y^.l := gc
130   else
131     y^.r := gc;
132   Rotate := gc;
133 end;
134 
135 function Split(const Value: MyFileRec; gg, g, p, x: link): link;
136 begin
137   x^.red := true;
138   x^.l^.red := false;
139   x^.r^.red := false;
140   if p^.red then
141   begin
142     g^.red := true;
143     if (Value.Name < g^.Key) <> (Value.Name < p^.Key) then
144       p := Rotate(Value, g);
145     x := rotate(Value, gg);
146     x^.red := false;
147   end;
148   Head^.r^.red := false;
149   split := x;
150 end;
151 
152 function RBTreeInsert(const Value: MyFileRec; x: link): link;
153 {Insert file record into red/ black tree}
154 var
155   gg, g, p: link;
156 begin
157   p := x;
158   g := x;
159   repeat
160     gg := g;
161     g := p;
162     p := x;
163     if Value.Name < x^.Key then
164       x := x^.l
165     else
166       x := x^.r;
167     if x^.l^.red and x^.r^.red then
168       x := split(Value, gg, g, p, x);
169   until
170     x = z;
171   new(x);
172 {$IFDEF SAFE}
173   if x = nil then
174     exit;
175 {$ENDIF}
176   x^.Key := Value.Name;
177   New(x^.DataP);
178 {$IFDEF SAFE}
179   if x^.DataP = nil then
180     exit;
181 {$ENDIF}
182   x^.DataP^.Next := nil;
183   x^.DataP^.Time := Value.Time;
184   x^.DataP^.Size := Value.Size;
185   x^.DataP^.Path := Value.Path;
186   x^.l := z;
187   x^.r := z;
188   if Value.Name < p^.Key then
189     p^.l := x
190   else
191     p^.r := x;
192   RbTreeInsert := x;
193   x := Split(Value, gg, g, p, x);
194 end;
195 
196 procedure Traverse(p: link);
197 {Traverse red/ black tree, printing out results}
198 var
199   TempQ, q: pDataRec;
200 begin
201   if (p^.l <> z) and (p^.l <> nil) then
202     Traverse(p^.l);
203   if (p <> head) then
204   begin
205     if p^.Key <> OldName then
206     begin
207       OldName := p^.Key;
208       writeln(OldName);
209     end;
210     q := p^.DataP;
211     while q <> nil do
212     begin
213       with q^ do
214         writeln(size: 10, '   ', FormatDateTime('yyyy-mm-dd hh:nn:ss',
215           FileDateToDateTime(Time)), '   ', Path^.Path);
216 {$IFDEF CLEANUP}
217       TempQ := q;
218       q := q^.Next;
219       Dispose(TempQ);
220 {$ELSE}
221       q := q^.Next;
222 {$ENDIF}
223     end;
224     writeln;
225   end;
226   if (p^.r <> z) and (p^.r <> nil) then
227     Traverse(p^.r);
228 {$IFDEF CLEANUP}
229   Dispose(p);
230 {$ENDIF}
231 end;
232 
233 function AddBackSlash(const DirName: string): string;
234 {Add a default backslash to a directory name}
235 begin
236   if DirName[Length(DirName)] in DosDelimSet then
237     AddBackSlash := DirName
238   else
239     AddBackSlash := DirName + '\';
240 end;
241 
242 function Hash(const Key: NameString): word;
243 var
244   h: word;
245   j: integer;
246   Len: integer;
247 begin
248   Len := Length(Key);
249   h := ord(Key[1]);
250   for j := 2 to Len do
251   begin
252     h := ((h * 32) + Ord(Key[j])) mod MAXHASH;
253   end;
254   Hash := h;
255 end;
256 
257 procedure Add(var SR: tSearchRec; DirP: pPath);
258 {Add a new search record/ path to the hash table}
259 var
260   p, q, r: pFileRec;
261   h: word;
262   TempData: pDataRec;
263 begin
264   h := Hash(SR.Name);
265   New(r);
266 {$IFDEF SAFE}
267   if r = nil then
268     exit;
269 {$ENDIF}
270   r^.Name := SR.Name;
271   r^.Time := SR.Time;
272   r^.Size := SR.Size;
273   r^.Path := DirP;
274   r^.Next := nil;
275   r^.Node := nil;
276   p := HashTable^[H];
277   if p = nil then
278   begin {Hash slot not used}
279     HashTable^[h] := r;
280   end
281   else
282   begin
283     q := p;
284     while (p <> nil) and (p^.Name < SR.Name) do
285     begin
286       q := p;
287       p := p^.Next;
288     end;
289     if (p <> nil) and (p^.Name = SR.Name) then
290     begin {Found duplicate file}
291       if p^.Node = nil then
292       begin {was not already in tree}
293         p^.Node := RBTreeInsert(p^, Head);
294           {save link so we don't have to search tree next time}
295       end;
296       New(TempData);
297 {$IFDEF SAFE}
298       if TempData = nil then
299         exit;
300 {$ENDIF}
301       TempData^.Time := Sr.Time;
302       TempData^.Size := Sr.Size;
303       TempData^.Path := DirP;
304       TempData^.Next := p^.Node^.DataP; {Add to linked list on tree node}
305       p^.Node^.DataP := TempData;
306       Dispose(r); {didn't need it after all}
307     end
308     else
309     begin {Not a duplicate}
310       if p = q then
311       begin {Add at start of linked list}
312         HashTable^[H] := r;
313         r^.Next := P;
314       end
315       else
316       begin {Insert into linked list}
317         q^.Next := r;
318         r^.Next := p;
319       end;
320     end;
321   end;
322 end;
323 
324 procedure Find(const Path: PathStr);
325 {Recursive file/directory searcher}
326 var
327   Sr: tSearchRec;
328   DirP: pPath;
329   r: integer;
330 begin
331   New(DirP);
332 {$IFDEF SAFE}
333   if DirP = nil then
334     exit;
335 {$ENDIF}
336   DirP^.Path := Path;
337   DirP^.Next := PathHead;
338   PathHead := DirP;
339   r := FindFirst(AddBackSlash(Path) + '*.*', faAnyFile, Sr);
340   while r = 0 do
341   begin
342 {$IFDEF DRBOB} {only do non-hidden directories}
343     if ((Sr.Attr and faDirectory) <> 0) and ((Sr.Attr and faHidden) = 0) then
344     begin
345 {$ELSE} {do them all}
346     if (Sr.Attr and Directory) <> 0 then
347     begin
348 {$ENDIF}
349       if Sr.Name[1] <> '.' then
350         Find(AddBackSlash(Path) + Sr.Name);
351       r := 0;
352     end
353     else
354     begin
355 {$IFDEF DRBOB}
356       if (Sr.Attr and faHidden) = 0 then {Only do non-hidden files}
357 {$ENDIF}
358         Add(Sr, DirP);
359     end;
360     r := FindNext(Sr);
361   end;
362 end;
363 
364 function HeapFunc(Size: Word): Integer; far;
365 begin
366   HeapFunc := 1;
367 end;
368 
369 procedure Init;
370 begin
371   OldName := '';
372   PathHead := nil;
373 {$IFDEF SAFE}
374   HeapError := @HeapFunc;
375 {$ENDIF}
376   New(HashTable);
377 {$IFDEF SAFE}
378   if HashTable = nil then
379     halt;
380 {$ENDIF}
381   FillChar(HashTable^, sizeof(HashTable^), 0);
382   RBTreeInitialize;
383 end;
384 
385 procedure Process;
386 begin
387   Find(ParamStr(1) + '\');
388   Traverse(Head);
389 end;
390 
391 procedure Done;
392 var
393   i: integer;
394   q, tempq: pFileRec;
395 begin
396 {$IFDEF CLEANUP}
397   Dispose(Z);
398   for i := 0 to MAXHASH - 1 do
399   begin
400     if HashTable^[i] <> nil then
401     begin
402       q := HashTable^[i];
403       while q <> nil do
404       begin
405         tempq := q^.next;
406         Dispose(q);
407         q := tempq;
408       end;
409     end;
410   end;
411   Dispose(HashTable);
412   TempPathHead := PathHead;
413   while PathHead <> nil do
414   begin
415     TempPathHead := PathHead^.Next;
416     FreeMem(PathHead, Length(PathHead^.Path) + 5);
417     PathHead := TempPathHead;
418   end;
419 {$ENDIF}
420 end;
421 
422 begin
423   Init;
424   Process;
425   Done;
426 end.


			
Vote: How useful do you find this Article/Tip?
Bad Excellent
1 2 3 4 5 6 7 8 9 10

 

Advertisement
Share this page
Advertisement
Download from Google

Copyright © Mendozi Enterprises LLC