I am trying to understand the implementation of the Rabin-Karp algorithm. d is the number of characters in the input alphabet, but if I replace 0 or any other value instead of 20, it won't affect anything. Why is this happening like this ?
// Rabin-Karp algorithm in C
#include <string.h>
#include <iostream>
using namespace std;
#define d 20
void rabinKarp(char pattern[], char text[], int q) {
int m = strlen(pattern);
int n = strlen(text);
int i, j;
int p = 0;
int t = 0;
int h = 1;
for (i = 0; i < m - 1; i )
h = (h * d) % q;
// Calculate hash value for pattern and text
for (i = 0; i < m; i ) {
p = (d * p pattern[i]) % q;
t = (d * t text[i]) % q;
}
// Find the match
for (i = 0; i <= n - m; i ) {
if (p == t) {
for (j = 0; j < m; j ) {
if (text[i j] != pattern[j])
break;
}
if (j == m)
cout << "Pattern is found at position: " << i 1 << endl;
}
if (i < n - m) {
t = (d * (t - text[i] * h) text[i m]) % q;
if (t < 0)
t = (t q);
}
}
}
int main() {
// char text[] = "ABCCDXAEFGX";
char text[] = "QWERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
char pattern[] = "KLXQW";
int q = 13;
rabinKarp(pattern, text, q);
}
CodePudding user response:
I believe the short answer is that the lower d
is the more hash collisions you will have, but you go about verifying the match anyway so it does not affect anything.
A bit more verbose:
First let me modify your code to be have more expressive variables:
// Rabin-Karp algorithm in C
#include <string.h>
#include <iostream>
using namespace std;
#define HASH_BASE 0
void rabinKarp(char pattern[], char text[], int inputBase) {
int patternLen = strlen(pattern);
int textLen = strlen(text);
int i, j; //predefined iterators
int patternHash = 0;
int textHash = 0;
int patternLenOut = 1;
for (i = 0; i < patternLen - 1; i )
patternLenOut = (patternLenOut * HASH_BASE) % inputBase; // hash of pattern len
// Calculate hash value for pattern and text
for (i = 0; i < patternLen; i ) {
patternHash = (HASH_BASE * patternHash pattern[i]) % inputBase;
textHash = (HASH_BASE * textHash text[i]) % inputBase;
}
// Find the match
for (i = 0; i <= textLen - patternLen; i ) {
if (patternHash == textHash) {
for (j = 0; j < patternLen; j ) {
if (text[i j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i 1 << endl;
}
if (i < textLen - patternLen) {
textHash = (HASH_BASE * (textHash - text[i] * patternLenOut) text[i patternLen]) % inputBase;
if (textHash < 0)
textHash = (textHash inputBase);
}
}
}
int main() {
// char text[] = "ABCCDXAEFGX";
char text[] = "QWEEERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
char pattern[] = "EE";
int q = 13;
rabinKarp(pattern, text, q);
}
The easiest way to attack it is to set HASH_BASE
(previously d
) to zero and see where we can simplify. The rabinKarp function can then be reduced to:
void rabinKarp(char pattern[], char text[], int inputBase) {
int patternLen = strlen(pattern);
int textLen = strlen(text);
int i, j; //predefined iterators
int patternHash = 0;
int textHash = 0;
int patternLenOut = 0;
// Calculate hash value for pattern and text
for (i = 0; i < patternLen; i ) {
patternHash = (pattern[i]) % inputBase;
textHash = (text[i]) % inputBase;
}
// Find the match
for (i = 0; i <= textLen - patternLen; i ) {
if (patternHash == textHash) {
for (j = 0; j < patternLen; j ) {
if (text[i j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i 1 << endl;
}
if (i < textLen - patternLen) {
textHash = (text[i patternLen]) % inputBase;
if (textHash < 0)
textHash = (textHash inputBase);
}
}
}
now you'll notice that all the hashes becomes is the sum of the letters mod some number (in your case 13, in my case 2). This is a bad hash, meaning many things will sum to the same number. However, in this portion of the code:
if (patternHash == textHash) {
for (j = 0; j < patternLen; j ) {
if (text[i j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i 1 << endl;
}
you explicitly check the match, letter by letter, if the hashes match. The worse your hash function is, the more often you will have false positives (which will mean a longer runtime for your function). There are more details, but I believe that directly answers your question. What might be interesting is to record false positives and see how the false positive rate increases as d
and q
decrease.