|
|
|
|
|
01234567890
i0=2 i=6 (i0 = i - j)
| |
v v
T = abbadabacba
P = abcad
^
|
j=4
|
01234567890
i0=0 i0+j (i = i0 + j)
| |
v v
T = abbadabacba
P = abcad
^
|
j=4
|
|
01234567890123
i0=3 i0+j (i = i0 + j)
| |
v v
T = abaabbaxabacba x does not occur in P
P = abcad
^
|
j=4
|
How to update the variables i0 and j:
Before:
01234567890123
i0=3 i0+j (i = i0 + j)
| |
v v
T = abaabbaxabacba
P = abcad
^
|
j=4
|
01234567890123456789012
i0=3 i0+j (i = i0 + j)
| |
v v
T = abaabbabaxabacbaabababc
P = abcaaadab
^
|
j=8
P[j] == T[i0+j] ==> j-- (Compare the previous character)
|
How to update the variables i0 and j:
Before:
01234567890123456789012
i0=3 i0+j (i = i0 + j)
| |
v v
T = abaabbabaxabacbaabababc
P = abcaaadab
^
|
j=6
|
|
Why does the heuristic work:
T = ................x......
P = ..x...x.......
^^^
These characters WILL cause a mismatch with x !!
|
012345678901234567890
i0=0 i0+j
| |
v v
T = abbababcabacbaabababc
P = abcacabdab
^ ^
| |
| j=7
|
right-most occurrence of c in P
|
|
Advanced note:
|
012345
P = tomato
The lastOcc() function of P is:
lastOcc('a') = 3
lastOcc('m') = 2
lastOcc('o') = 5
lastOcc('t') = 4
and for all other characters:
lastOcc(.) = -1
|
/**
 * Builds the "last occurrence" table of a pattern.
 *
 * For every ASCII character c, entry lastOcc[c] holds the index of the
 * right-most position at which c appears in P, or -1 when c does not
 * appear in P at all.
 *
 * @param P the pattern; every character must have a code below 128,
 *          because the table has exactly 128 entries
 * @return an array of length 128 indexed by character code
 */
public static int[] buildLastFunction (String P)
{
   int[] lastOcc = new int[128];     // assume ASCII character set

   /* =========================================
      Initialize every element to -1
      ========================================= */
   for (int i = 0; i < 128; i++)
   {
      lastOcc[i] = -1;               // -1 means "does not occur in P"
   }

   /* ===============================================
      Update lastOcc[c] with position of character c
      =============================================== */
   for (int pos = 0; pos < P.length(); pos++)
   {
      char c = P.charAt(pos);        // c was used without a declaration before
      lastOcc[ c ] = pos;            // Later positions overwrite earlier ones,
                                     // so ONLY the LAST position is retained
   }

   return lastOcc;
}
|
0123456789012345678901234
i0=4 i0+j
| |
v v
T = abcaabbababcabacbaabababc ('c' == T[i0+j])
P = abcacabdab
^ ^
| |
| j=7
|
lastOcc['c'] = 4 or better: lastOcc[ T[i0+j] ] = 4
<->
j - lastOcc[ T[i0+j] ] <==== amount to slide pattern P !!
|
|
Proof:
|
|
mismatch
|
v
T = .............x..............
P = ..x........x....
^
|
right-most occurrence of
mismatched character in P
|
when the right-most occurrence of the mismatched character is located further "down" in the pattern
0123456789012345678901234
i0=4 i0+j
| |
v v
T = abaaabbababcabcacbaabababc ('c' == T[i0+j])
P = abcacabdabc
^ ^
| |
j=7 |
|
lastOcc['c'] = 10
<->
j - lastOcc[ T[i0+j] ] = -3 ?!
|
|
|
In terms of program statements, this solution is coded as follows:
0123456789012345678901234
i0=4 i0+j
| |
v v
T = abaaabbababcabcacbaabababc ('c' == T[i0+j])
P = abcacabdabc
^ ^
| |
j=7 |
|
lastOcc['c'] = 10
j < lastOcc[ T[i0+j] ] !!!!
// Decide how far to slide P after a mismatch at pattern position j.
if ( j < lastOcc[ T[i0+j] ] )
{
// The right-most occurrence of the mismatched text character lies to the
// right of j, so j - lastOcc[...] would be negative (a backward slide).
i0++; // Slide pattern 1 character further
j = m-1; // Restart matching from the last char in P
}
else
{
// Slide by j - lastOcc[...] so that the right-most occurrence of the
// mismatched text character in P lines up with that text position.
i0 = i0 + j - lastOcc[T.charAt(i0+j)]; // FAST slide
j = m-1; // Restart matching from the last char in P
}
|
BoyerMooreSimp(T, P)
{
n = T.length();
m = P.length();
computeLastOcc(P); // Find last positions of all characters in P
i0 = 0; // Line P up at T[0]
while ( i0 < (n-m) )
{
j = m-1; // Start at the last char in P
while ( P[j] == T[i0+j] )
{
j--; // Check "next" (= previous) character
if ( j < 0 )
return (i0); // P found !
}
/* ====================================================
If program reaches this place, we have a mismatch
between P[j] <=> T[i0+j]
==================================================== */
if ( j < lastOcc[T.charAt(i0+j)] )
{
/* ============================
Handle bad character caveat
============================ */
i0++; // Slide P 1 character further (Goodrich)
// "j = m-1" is executed by the start of the loop...
}
else
{
i0 = i0 + j - lastOcc[T.charAt(i0+j)];
// "j = m-1" is executed by the start of the loop...
}
}
return -1; // P not found in T
}
|
/**
 * Computes, for each of the 128 ASCII character codes, the index of the
 * right-most position at which that character appears in P.
 *
 * @param P the pattern; its characters are used directly as array
 *          indices, so each must have a code below 128
 * @return a 128-entry table holding the last index of each character
 *         in P, with -1 for characters that never appear
 */
public static int[] computeLastOcc(String P)
{
   final int[] table = new int[128];   // one slot per ASCII code

   // Mark every character as "absent" first.
   int slot = table.length;
   while (slot-- > 0)
   {
      table[slot] = -1;
   }

   // Walk the pattern left to right; a later position overwrites an
   // earlier one, so each slot ends up with the right-most index.
   int index = 0;
   for (char ch : P.toCharArray())
   {
      table[ch] = index++;
   }

   return table;
}
|
How to run the program:
|
Sample output:
+++++++++++++++++++++++++++++++++++++
=====================================
Matching: i = 5, j = 5
01234567890123456789
abacaxbaccabacbbaabb
abacbb
012345
^
| *** slide past "non-occurring" char
****** lastOcc['x'] = -1
+++++++++++++++++++++++++++++++++++++
=====================================
Matching: i = 11, j = 5
01234567890123456789
abacaxbaccabacbbaabb
abacbb
012345
^
|
=====================================
Matching: i = 10, j = 4
01234567890123456789
abacaxbaccabacbbaabb
abacbb
012345
^
| *** line up with last occ
****** lastOcc['a'] = 2
+++++++++++++++++++++++++++++++++++++
=====================================
Matching: i = 13, j = 5
01234567890123456789
abacaxbaccabacbbaabb
abacbb
012345
^
| *** line up with last occ
****** lastOcc['c'] = 3
+++++++++++++++++++++++++++++++++++++
=====================================
Matching: i = 15, j = 5
01234567890123456789
abacaxbaccabacbbaabb
abacbb
012345
^
|
P found !!!
|
|
Instead, I'll spend time teaching the Boyer-Moore-Horspool algorithm