Articles   Members Online:
-Article/Tip Search
-News Group Search over 21 Million news group articles.
-Delphi/Pascal
-CBuilder/C++
-C#Builder/C#
-JBuilder/Java
-Kylix
Member Area
-Home
-Account Center
-Top 10 NEW!!
-Submit Article/Tip
-Forums Upgraded!!
-My Articles
-Edit Information
-Login/Logout
-Become a Member
-Why sign up!
-Newsletter
-Chat Online!
-Indexes NEW!!
Employment
-Build your resume
-Find a job
-Post a job
-Resume Search
Contacts
-Contacts
-Feedbacks
-Link to us
-Privacy/Disclaimer
Embarcadero
Visit Embarcadero
Embarcadero Community
JEDI
Links
How to Implement fuzzy search Turn on/off line numbers in source code. Switch to Orginial background IDE or DSP color Comment or reply to this aritlce/tip for discussion. Bookmark this article to my favorite article(s). Print this article
30-Aug-02
Category
Algorithm
Language
Delphi All Versions
Views
148
User Rating
No Votes
# Votes
0
Replies
0
Publisher:
DSP, Administrator
Reference URL:
DKB
			Author: Tomas Rutkauskas

How to implement fuzzy search

Answer:

Solve 1:

This DLL calculates the Levenshtein Distance between two strings. Please note that 
ShareMem must be the first unit in the Uses clause of the Interface section of your 
unit, if your DLL exports procedures or functions, which pass string parameters or 
function results. ShareMem is the interface to delphimm.dll, which you have to 
distribute together with your own DLL. To avoid using delphimm.dll, pass string 
parameters by using PChar or ShortString parameters.

1   library Levensh;
2   
3   uses
4     ShareMem, SysUtils;
5   
6   var
7     FiR0: integer;
8     FiP0: integer;
9     FiQ0: integer;
10  
11  function Min(X, Y, Z: Integer): Integer;
12  begin
13    if (X < Y) then
14      Result := X
15    else
16      Result := Y;
17    if (Result > Z) then
18      Result := Z;
19  end;
20  
21  procedure LevenshteinPQR(p, q, r: integer);
22  begin
23    FiP0 := p;
24    FiQ0 := q;
25    FiR0 := r;
26  end;
27  
28  function LevenshteinDistance(const sString, sPattern: string): Integer;
29  const
30    MAX_SIZE = 50;
31  var
32    aiDistance: array[0..MAX_SIZE, 0..MAX_SIZE] of Integer;
33    i, j, iStringLength, iPatternLength, iMaxI, iMaxJ: Integer;
34    chChar: Char;
35    iP, iQ, iR, iPP: Integer;
36  begin
37    iStringLength := length(sString);
38    if (iStringLength > MAX_SIZE) then
39      iMaxI := MAX_SIZE
40    else
41      iMaxI := iStringLength;
42    iPatternLength := length(sPattern);
43    if (iPatternLength > MAX_SIZE) then
44      iMaxJ := MAX_SIZE
45    else
46      iMaxJ := iPatternLength;
47    aiDistance[0, 0] := 0;
48    for i := 1 to iMaxI do
49      aiDistance[i, 0] := aiDistance[i - 1, 0] + FiR0;
50    for j := 1 to iMaxJ do
51    begin
52      chChar := sPattern[j];
53      if ((chChar = '*') or (chChar = '?')) then
54        iP := 0
55      else
56        iP := FiP0;
57      if (chChar = '*') then
58        iQ := 0
59      else
60        iQ := FiQ0;
61      if (chChar = '*') then
62        iR := 0
63      else
64        iR := FiR0;
65      aiDistance[0, j] := aiDistance[0, j - 1] + iQ;
66      for i := 1 to iMaxI do
67      begin
68        if (sString[i] = sPattern[j]) then
69          iPP := 0
70        else
71          iPP := iP;
72        {aiDistance[i, j] := Minimum of 3 values}
73        aiDistance[i, j] := Min(aiDistance[i - 1, j - 1] + iPP,
74          aiDistance[i, j - 1] + iQ,
75          aiDistance[i - 1, j] + iR);
76      end;
77    end;
78    Result := aiDistance[iMaxI, iMaxJ];
79  end;
80  
81  exports
82    LevenshteinDistance Index 1,
83    LevenshteinPQR Index 2;
84  
85  begin
86    FiR0 := 1;
87    FiP0 := 1;
88    FiQ0 := 1;
89  end.



Solve 2:

This is an old Pascal code snippet, which is based on a C project published in the 
C't magazine somewhen back in the 1990's. Can't remember where I found it on the 
WWW. Please note that the code below accesses a simple *.txt file to search in.

90  program FuzzySearch;
91  {Translation from C to Pascal by Karsten Paulini and Simon Reinhardt}
92  const
93    MaxParLen = 255;
94  var
95    InFile: Text;
96    Filename: string;
97    InputStr: string;
98    SearchStr: string;
99    Treshold: Integer;
100 
101 function PrepareTheString(OriginStr: string; var ConvStr: string): Integer;
102 var
103   i: Integer;
104 begin
105   ConvStr := OriginStr;
106   for i := 1 to Length(OriginStr) do
107   begin
108     ConvStr[i] := UpCase(ConvStr[i]);
109     if ConvStr[i] < '0' then
110       ConvStr[i] := ' '
111     else
112       case ConvStr[i] of
113         Chr(196): ConvStr[i] := Chr(228);
114         Chr(214): ConvStr[i] := Chr(246);
115         Chr(220): ConvStr[i] := Chr(252);
116         Chr(142): ConvStr[i] := Chr(132);
117         Chr(153): ConvStr[i] := Chr(148);
118         Chr(154): ConvStr[i] := Chr(129);
119         ':': ConvStr[i] := ' ';
120         ';': ConvStr[i] := ' ';
121         '<': ConvStr[i] := ' ';
122         '>': ConvStr[i] := ' ';
123         '=': ConvStr[i] := ' ';
124         '?': ConvStr[i] := ' ';
125         '[': ConvStr[i] := ' ';
126         ']': ConvStr[i] := ' ';
127       end;
128   end;
129   PrepareTheString := i;
130 end;
131 
132 function NGramMatch(TextPara, SearchStr: string; SearchStrLen, NGramLen: Integer;
133   var MaxMatch: Integer): Integer;
134 var
135   NGram: string[8];
136   NGramCount: Integer;
137   i, Count: Integer;
138 begin
139   NGramCount := SearchStrLen - NGramLen + 1;
140   Count := 0;
141   MaxMatch := 0;
142   for i := 1 to NGramCount do
143   begin
144     NGram := Copy(SearchStr, i, NGramLen);
145     if (NGram[NGramLen - 1] = ' ') and (NGram[1] < > ' ') then
146       Inc(i, NGramLen - 3) {will be increased in the loop}
147     else
148     begin
149       Inc(MaxMatch, NGramLen);
150       if Pos(NGram, TextPara) > 0 then
151         Inc(Count);
152     end;
153   end;
154   NGramMatch := Count * NGramLen;
155 end;
156 
157 procedure FuzzyMatching(SearchStr: string; Treshold: Integer; var InFile: Text);
158 var
159   TextPara: string;
160   TextBuffer: string;
161   TextLen: Integer;
162   SearchStrLen: Integer;
163   NGram1Len: Integer;
164   NGram2Len: Integer;
165   MatchCount1: Integer;
166   MatchCount2: Integer;
167   MaxMatch1: Integer;
168   MaxMatch2: Integer;
169   Similarity: Real;
170   BestSim: Real;
171 begin
172   BestSim := 0.0;
173   SearchStrLen := PrepareTheString(SearchStr, SearchStr);
174   NGram1Len := 3;
175   if SearchStrLen < 7 then
176     NGram2Len := 2
177   else
178     NGram2Len := 5;
179   while not Eof(InFile) do
180   begin
181     Readln(InFile, TextBuffer);
182     TextLen := PrepareTheString(TextBuffer, TextPara) + 1;
183     TextPara := Concat(' ', TextPara);
184     if TextLen < MaxParLen - 2 then
185     begin
186       MatchCount1 := NGramMatch(TextPara, SearchStr, SearchStrLen, NGram1Len, 
187 MaxMatch1);
188       MatchCount2 := NGramMatch(TextPara, SearchStr, SearchStrLen, NGram2Len, 
189 MaxMatch2);
190       Similarity := 100.0 * (MatchCount1 + MatchCount2) / (MaxMatch1 + MaxMatch2);
191       if Similarity > BestSim then
192         BestSim := Similarity;
193       if Similarity >= Treshold then
194       begin
195         Writeln;
196         Writeln('[', Similarity, '] ', TextBuffer);
197       end;
198     end;
199   else
200     Writeln('Paragraph too long');
201 end;
202 if BestSim < Treshold then
203   Writeln('No match; Best Match was ', BestSim);
204 end;
205 
206 begin
207   Writeln;
208   Writeln('+------------------------------------------+');
209   Writeln('| Fuzzy Search in Information Retrieval |');
210   Writeln('|         (C) 1997 Reinhard Rapp           |');
211   Writeln('+------------------------------------------+');
212   Writeln;
213   write('Name of file to search in: ');
214   Readln(Filename);
215   write('Search string: ');
216   Readln(InputStr);
217   SearchStr := Concat(' ', InputStr, ' ');
218   write('Minimum hit quality in % : ');
219   Readln(Treshold);
220   if (Treshold > 0) and (Treshold <= 100) and (SearchStr < > '') and (Filename < > 
221 '') then
222   begin
223     Assign(InFile, Filename);
224     Reset(InFile);
225     FuzzyMatching(SearchStr, Treshold, InFile);
226     Close(InFile);
227   end;
228   Writeln;
229   Writeln('Bye!');
230 end.



Solve 3:

231 unit FuzzyMatch;
232 
233 {This unit provides a basic 'fuzzy match' index on how alike two strings are
234      The result is of type 'single': near 0 - poor match
235                                      near 1 - close match
236      The intention is that HowAlike(s1,s2)=HowAlike(s2,s1)
237      The Function is not case sensitive}
238 
239 interface
240 
241 uses sysutils;
242 
243 function HowAlike(s1, s2: string): single;
244 
245 implementation
246 
247 function instr(start: integer; ToSearch, ToFind: string): integer;
248 begin
249   //This is a quick implementation of the VB InStr, since Pos just doesn't do what 
250 is needed!!
251   //NB - case sensitive!!
252   if start > 1 then
253     Delete(ToSearch, 1, start - 1);
254   result := pos(ToFind, ToSearch);
255   if (result > 0) and (start > 1) then
256     inc(result, start);
257 end;
258 
259 function HowAlike(s1, s2: string): single;
260 var
261   l1, l2, pass, position, size, foundpos, maxscore: integer;
262   score, scored, string1pos, string2pos, bestmatchpos: single;
263   swapstring, searchblock: string;
264 begin
265   s1 := Uppercase(trim(s1));
266   s2 := Uppercase(trim(s2));
267 
268   score := 0;
269   maxscore := 0;
270   scored := 0;
271 
272   //deal with zero length strings...
273   if (s1 = '') and (s2 = '') then
274   begin
275     result := 1;
276     exit;
277   end
278   else if (s1 = '') or (s2 = '') then
279   begin
280     result := 0;
281     exit;
282   end;
283 
284   //why perform any mathematics is the result is clear?
285   if s1 = s2 then
286   begin
287     result := 1;
288     exit;
289   end;
290 
291   //make two passes,
292   //     with s1 and s2 each way round to ensure
293   //     consistent results
294   for pass := 1 to 2 do
295   begin
296     l1 := length(s1);
297     l2 := length(s2);
298     for size := l1 downto 1 do
299     begin
300       for position := 1 to (l1 - size + 1) do
301       begin
302         //try to find implied block in the other string
303         //Big blocks score much better than small blocks
304         searchblock := copy(s1, position, size);
305         foundpos := pos(searchblock, s2);
306 
307         if size = l1 then
308           string1pos := 0.5
309         else
310           string1pos := (position - 1) / (l1 - size);
311 
312         if foundpos > 0 then
313         begin
314           //the string is in somewhere in there
315           //    - find the 'closest' one.
316           bestmatchpos := -100; //won't find anything that far away!
317 
318           repeat
319             if size = l2 then
320               string2pos := 0.5
321             else
322               string2pos := (foundpos - 1) / (l2 - size);
323 
324             //If this closer than the previous best?
325             if abs(string2pos - string1pos) < abs(bestmatchpos - string1pos) then
326               bestmatchpos := string2pos;
327 
328             foundpos := instr(foundpos + 1, s2, searchblock);
329           until foundpos = 0; //loop while foundpos>0..
330 
331           //The closest position is now known: Score it!
332           //Score as follows: (1-distance of best match)
333           score := score + (1 - abs(string1pos - bestmatchpos));
334         end;
335 
336         //Keep track if the maximum possible score
337         //BE CAREFUL IF CHANGING THIS FUNCTION!!!
338 
339         //maxscore:=maxscore+1;
340         inc(maxscore);
341       end; //for position..
342     end; //for size..
343 
344     if pass = 1 then
345     begin
346       //swap the strings around
347       swapstring := s1;
348       s1 := s2;
349       s2 := swapstring;
350     end;
351 
352     //Each pass is weighted equally
353 
354     scored := scored + (0.5 * (score / maxscore));
355     score := 0;
356     maxscore := 0;
357   end; //for pass..
358 
359   //HowAlike=score/maxscore
360   result := scored;
361 end;



Solve 4:

A Delphi implementation of the Levenshtein Distance Algorithm

362 unit Levenshtein;
363 
364 {Objeto que calcula la distancia de Levenshtein entre 2 cadenas.
365 Alvaro Jeria Madariaga. 04/10/2002
366 barbaro@hotpop.com}
367 
368 interface
369 
370 uses
371   sysutils, Math;
372 
373 type
374   Tdistance = class(TObject)
375   private
376     function minimum(a, b, c: Integer): Integer;
377   public
378     function LD(s, t: string): Integer;
379   end;
380 
381 implementation
382 
383 function Tdistance.minimum(a, b, c: Integer): Integer;
384 var
385   mi: Integer;
386 begin
387   mi := a;
388   if (b < mi) then
389     mi := b;
390   if (c < mi) then
391     mi := c;
392   Result := mi;
393 end;
394 
395 function Tdistance.LD(s, t: string): Integer;
396 var
397   d: array of array of Integer;
398   n, m, i, j, costo: Integer;
399   s_i, t_j: char;
400 begin
401   n := Length(s);
402   m := Length(t);
403   if (n = 0) then
404   begin
405     Result := m;
406     Exit;
407   end;
408   if m = 0 then
409   begin
410     Result := n;
411     Exit;
412   end;
413   setlength(d, n + 1, m + 1);
414   for i := 0 to n do
415     d[i, 0] := i;
416   for j := 0 to m do
417     d[0, j] := j;
418   for i := 1 to n do
419   begin
420     s_i := s[i];
421     for j := 1 to m do
422     begin
423       t_j := t[j];
424       if s_i = t_j then
425         costo := 0
426       else
427         costo := 1;
428       d[i, j] := Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + costo);
429     end;
430   end;
431   Result := d[n, m];
432 end;
433 
434 end.


I've written some function that compares two strings and returns true, if they are 
identical or similar ('house' is similar to 'mouse', 'hose', 'houses' or 'horse'). 
It works quite good, but I guess there are other implementations around, and I'd 
like to compare mine to others. I'd also like to be able to find any identical or 
similar substrings inside a longer string, what's not possible with my (very 
simple) algorithm. Do you know about any sources or other documentation?

Levenshtein matching gives the number of steps (single character replacement or 
addition) needed to transform StringA into String B. Ratcliffe matching gives the 
percentage of possible character matches between StringA and StringB, based on the 
longest matching sequences and subsequences between the two strings.
435 
436 function CompareStrings_Levenshtein(const A, B: string; CaseSensitive: Boolean =
437   False): Integer;
438 
439   function Minimum3(x, y, z: Integer): Integer;
440   begin
441     Result := Min(x, y);
442     Result := Min(Result, z);
443   end;
444 
445 var
446   D: array of array of Integer;
447   n, m, i, j, Cost: Integer;
448   AI, BJ: Char;
449   A1, B1: string;
450 begin
451   n := Length(A);
452   m := Length(B);
453   if (n = 0) then
454     Result := m
455   else if m = 0 then
456     Result := n
457   else
458   begin
459     if CaseSensitive then
460       A1 := A
461     else
462       A1 := UpperCase(A);
463     if CaseSensitive then
464       B1 := B
465     else
466       B1 := UpperCase(B);
467     Setlength(D, n + 1, m + 1);
468     for i := 0 to n do
469       D[i, 0] := i;
470     for j := 0 to m do
471       D[0, j] := j;
472     for i := 1 to n do
473     begin
474       AI := A1[i];
475       for j := 1 to m do
476       begin
477         BJ := B1[j];
478         Cost := iff(AI = BJ, 0, 1);
479         D[i, j] := Minimum3(D[i - 1][j] + 1, D[i][j - 1] + 1, D[i - 1][j - 1] + 
480 Cost);
481       end;
482     end;
483     Result := D[n, m];
484   end;
485 end;
486 
487 function CompareStrings_Ratcliff(const A, B: string; CaseSensitive: Boolean = 
488 False):
489   Double;
490 var
491   A1, B1: string;
492   LenA, LenB: Integer;
493 
494   function CSRSub(StartA, EndA, StartB, EndB: Integer): Integer;
495   var
496     a, b, i, Matches, NewStartA, NewStartB: Integer;
497   begin
498     Result := 0;
499     NewStartA := 0;
500     NewStartB := 0;
501     if (StartA > EndA) or (StartB > EndB) or (StartA <= 0) or (StartB <= 0) then
502       Exit;
503     for a := StartA to EndA do
504     begin
505       for B := StartB to EndB do
506       begin
507         Matches := 0;
508         i := 0;
509         while (a + i <= EndA) and (b + i <= EndB) and (A1[a + i] = B1[b + i]) do
510         begin
511           Inc(Matches);
512           if Matches > Result then
513           begin
514             NewStartA := a;
515             NewStartB := b;
516             Result := Matches;
517           end;
518           Inc(i);
519         end;
520       end;
521     end;
522     if Result > 0 then
523     begin
524       Inc(Result, CSR_Sub(NewStartA + Result, EndA, NewStartB + Result, EndB));
525       Inc(Result, CSR_Sub(StartA, NewStartA - 1, StartB, NewStartB - 1));
526     end;
527   end;
528 
529 begin
530   if CaseSensitive then
531     A1 := A
532   else
533     A1 := UpperCase(A);
534   if CaseSensitive then
535     B1 := B
536   else
537     B1 := UpperCase(B);
538   LenA := Length(A1);
539   LenB := Length(B1);
540   if A1 = B1 then
541     Result := 100
542   else if (LenA = 0) or (LenB = 0) then
543     Result := 0
544   else
545     Result := CSR_Sub(1, LenA, 1, LenB) * 200 / (LenA + LenB);
546 end;


			
Vote: How useful do you find this Article/Tip?
Bad Excellent
1 2 3 4 5 6 7 8 9 10

 

Advertisement
Share this page
Advertisement
Download from Google

Copyright © Mendozi Enterprises LLC