我試圖了解 Rabin-Karp 演算法的實作。d 是輸入字母表中的字符數,但如果我替換 0 或任何其他值而不是 20,則不會有任何影響。為什么會這樣?
// Rabin-Karp algorithm in C
#include <string.h>
#include <iostream>
using namespace std;
#define d 20
void rabinKarp(char pattern[], char text[], int q) {
int m = strlen(pattern);
int n = strlen(text);
int i, j;
int p = 0;
int t = 0;
int h = 1;
for (i = 0; i < m - 1; i )
h = (h * d) % q;
// Calculate hash value for pattern and text
for (i = 0; i < m; i ) {
p = (d * p pattern[i]) % q;
t = (d * t text[i]) % q;
}
// Find the match
for (i = 0; i <= n - m; i ) {
if (p == t) {
for (j = 0; j < m; j ) {
if (text[i j] != pattern[j])
break;
}
if (j == m)
cout << "Pattern is found at position: " << i 1 << endl;
}
if (i < n - m) {
t = (d * (t - text[i] * h) text[i m]) % q;
if (t < 0)
t = (t q);
}
}
}
int main() {
// char text[] = "ABCCDXAEFGX";
char text[] = "QWERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
char pattern[] = "KLXQW";
int q = 13;
rabinKarp(pattern, text, q);
}
uj5u.com熱心網友回復:
我相信簡短的回答是,d哈希沖突越低,您將擁有的哈希沖突越多,但無論如何您都會驗證匹配,因此它不會影響任何事情。
稍微詳細一點:
首先讓我修改您的代碼以具有更具表現力的變數:
// Rabin-Karp algorithm in C
#include <string.h>
#include <iostream>
using namespace std;
#define HASH_BASE 0
void rabinKarp(char pattern[], char text[], int inputBase) {
int patternLen = strlen(pattern);
int textLen = strlen(text);
int i, j; //predefined iterators
int patternHash = 0;
int textHash = 0;
int patternLenOut = 1;
for (i = 0; i < patternLen - 1; i )
patternLenOut = (patternLenOut * HASH_BASE) % inputBase; // hash of pattern len
// Calculate hash value for pattern and text
for (i = 0; i < patternLen; i ) {
patternHash = (HASH_BASE * patternHash pattern[i]) % inputBase;
textHash = (HASH_BASE * textHash text[i]) % inputBase;
}
// Find the match
for (i = 0; i <= textLen - patternLen; i ) {
if (patternHash == textHash) {
for (j = 0; j < patternLen; j ) {
if (text[i j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i 1 << endl;
}
if (i < textLen - patternLen) {
textHash = (HASH_BASE * (textHash - text[i] * patternLenOut) text[i patternLen]) % inputBase;
if (textHash < 0)
textHash = (textHash inputBase);
}
}
}
int main() {
// char text[] = "ABCCDXAEFGX";
char text[] = "QWEEERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
char pattern[] = "EE";
int q = 13;
rabinKarp(pattern, text, q);
}
攻擊它的最簡單方法是將HASH_BASE(以前的d)設定為零,然后看看我們可以簡化的地方。rabinKarp 函式可以簡化為:
void rabinKarp(char pattern[], char text[], int inputBase) {
int patternLen = strlen(pattern);
int textLen = strlen(text);
int i, j; //predefined iterators
int patternHash = 0;
int textHash = 0;
int patternLenOut = 0;
// Calculate hash value for pattern and text
for (i = 0; i < patternLen; i ) {
patternHash = (pattern[i]) % inputBase;
textHash = (text[i]) % inputBase;
}
// Find the match
for (i = 0; i <= textLen - patternLen; i ) {
if (patternHash == textHash) {
for (j = 0; j < patternLen; j ) {
if (text[i j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i 1 << endl;
}
if (i < textLen - patternLen) {
textHash = (text[i patternLen]) % inputBase;
if (textHash < 0)
textHash = (textHash inputBase);
}
}
}
現在你會注意到所有的哈希值都是字母 mod 一些數字的總和(在你的例子中是 13,在我的例子中是 2)。這是一個糟糕的散列,這意味著許多事情的總和會是相同的數字。但是,在這部分代碼中:
if (patternHash == textHash) {
for (j = 0; j < patternLen; j ) {
if (text[i j] != pattern[j])
break;
}
if (j == patternLen)
cout << "Pattern is found at position: " << i 1 << endl;
}
如果散列匹配,則逐個字母地明確檢查匹配。您的哈希函式越糟糕,誤報的頻率就越高(這意味著您的函式運行時間更長)。還有更多細節,但我相信直接回答了您的問題。可能有趣的是記錄誤報并查看誤報率是如何增加d和q減少的。
轉載請註明出處,本文鏈接:https://www.uj5u.com/gongcheng/374290.html
下一篇:2個串列的最大路徑和
