C 中的Rabin-Karp演算法-有解無憂

我試圖了解 Rabin-Karp 演算法的實作。d 是輸入字母表中的字符數，但如果我替換 0 或任何其他值而不是 20，則不會有任何影響。為什么會這樣？

    // Rabin-Karp algorithm in C  
#include <string.h>
#include <iostream>
using namespace std;
#define d 20

void rabinKarp(char pattern[], char text[], int q) {
    int m = strlen(pattern);
    int n = strlen(text);
    int i, j;
    int p = 0;
    int t = 0;
    int h = 1;

    for (i = 0; i < m - 1; i  )
        h = (h * d) % q;

    // Calculate hash value for pattern and text
    for (i = 0; i < m; i  ) {
        p = (d * p   pattern[i]) % q;
        t = (d * t   text[i]) % q;
    }

    // Find the match
    for (i = 0; i <= n - m; i  ) {
        if (p == t) {
            for (j = 0; j < m; j  ) {
                if (text[i   j] != pattern[j])
                    break;
            }

            if (j == m)
                cout << "Pattern is found at position: " << i   1 << endl;
        }

        if (i < n - m) {
            t = (d * (t - text[i] * h)   text[i   m]) % q;

            if (t < 0)
                t = (t   q);
        }
    }
}

int main() {

   // char text[] = "ABCCDXAEFGX";
    char text[] = "QWERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
    char pattern[] = "KLXQW";
    int q = 13;
    rabinKarp(pattern, text, q);
}

uj5u.com熱心網友回復：

我相信簡短的回答是，d哈希沖突越低，您將擁有的哈希沖突越多，但無論如何您都會驗證匹配，因此它不會影響任何事情。

稍微詳細一點：

首先讓我修改您的代碼以具有更具表現力的變數：

// Rabin-Karp algorithm in C  
#include <string.h>
#include <iostream>
using namespace std;
#define HASH_BASE 0

void rabinKarp(char pattern[], char text[], int inputBase) {
    int patternLen = strlen(pattern);
    int textLen = strlen(text);
    int i, j; //predefined iterators
    int patternHash = 0;
    int textHash = 0;
    int patternLenOut = 1;

    for (i = 0; i < patternLen - 1; i  )
        patternLenOut = (patternLenOut * HASH_BASE) % inputBase; // hash of pattern len

    // Calculate hash value for pattern and text
    for (i = 0; i < patternLen; i  ) {
        patternHash = (HASH_BASE * patternHash   pattern[i]) % inputBase;
        textHash = (HASH_BASE * textHash   text[i]) % inputBase;
    }

    // Find the match
    for (i = 0; i <= textLen - patternLen; i  ) {
        if (patternHash == textHash) {
            for (j = 0; j < patternLen; j  ) {
                if (text[i   j] != pattern[j])
                    break;
            }

            if (j == patternLen)
                cout << "Pattern is found at position: " << i   1 << endl;
        }

        if (i < textLen - patternLen) {
            textHash = (HASH_BASE * (textHash - text[i] * patternLenOut)   text[i   patternLen]) % inputBase;
            
            if (textHash < 0)
                textHash = (textHash   inputBase);
        }
    }
}

int main() {

   // char text[] = "ABCCDXAEFGX";
    char text[] = "QWEEERTYUIOPASDFGHJKLXQWERTYUIOPASDFGHJKLX";
    char pattern[] = "EE";
    int q = 13;
    rabinKarp(pattern, text, q);
}

攻擊它的最簡單方法是將HASH_BASE（以前的d）設定為零，然后看看我們可以簡化的地方。rabinKarp 函式可以簡化為：

void rabinKarp(char pattern[], char text[], int inputBase) {
    int patternLen = strlen(pattern);
    int textLen = strlen(text);
    int i, j; //predefined iterators
    int patternHash = 0;
    int textHash = 0;
    int patternLenOut = 0;

    // Calculate hash value for pattern and text
    for (i = 0; i < patternLen; i  ) {
        patternHash = (pattern[i]) % inputBase;
        textHash = (text[i]) % inputBase;
    }

    // Find the match
    for (i = 0; i <= textLen - patternLen; i  ) {
        if (patternHash == textHash) {
            for (j = 0; j < patternLen; j  ) {
                if (text[i   j] != pattern[j])
                    break;
            }

            if (j == patternLen)
                cout << "Pattern is found at position: " << i   1 << endl;
        }

        if (i < textLen - patternLen) {
            textHash = (text[i   patternLen]) % inputBase;
            
            if (textHash < 0)
                textHash = (textHash   inputBase);
        }
    }
}

現在你會注意到所有的哈希值都是字母 mod 一些數字的總和（在你的例子中是 13，在我的例子中是 2）。這是一個糟糕的散列，這意味著許多事情的總和會是相同的數字。但是，在這部分代碼中：

if (patternHash == textHash) {
    for (j = 0; j < patternLen; j  ) {
        if (text[i   j] != pattern[j])
              break;
    }
    
    if (j == patternLen)
        cout << "Pattern is found at position: " << i   1 << endl;
}

如果散列匹配，則逐個字母地明確檢查匹配。您的哈希函式越糟糕，誤報的頻率就越高（這意味著您的函式運行時間更長）。還有更多細節，但我相信直接回答了您的問題。可能有趣的是記錄誤報并查看誤報率是如何增加d和q減少的。

轉載請註明出處，本文鏈接：https://www.uj5u.com/gongcheng/374290.html

標籤：C 算法拉宾卡普

上一篇：確定到達二維陣列左上角的最小移動次數

下一篇：2個串列的最大路徑和