OpenVINO場景文字檢測與識別-有解無憂

OpenVINO提供的場景文字檢測模型準確率非常高，完全可以達到實用級別。其實OpenVINO還提供了另外一個場景文字識別的模型，總體使用下來的感覺是沒有場景文字檢測那麼靠譜，而且只支持英文字母與數字識別，不支持中文，不得不說是一個小小的遺憾。但是對比較乾淨的檔案影像，它的識別準確率還是相當高，速度也比較快，基本上都在毫秒級別出結果。

模型介紹
文本識別(OCR)模型采用的網路架構為基礎網路+雙向LSTM，其中基礎網路選擇的是VGG16，字母識別是非大小寫敏感的，26個字母+10個數字總計36個字符。其網路結構類似如下：

模型輸入結構為：

[BxCxHxW]=1x1x32x120

其中B表示批次、C表示通道、H表示高度、W表示寬度
模型輸出結果為：

[WxBxL] = 30x1x37

其中B表示批次、W表示輸出序列長度、L表示37個字符各自的得分，其中第37個是#（空白佔位符）
輸出部分的解析基於CTC貪心解碼方式。
代碼實作

加載模型

# Load the IR files (model topology + weights) for both networks

log.info("Reading IR...")

# Network that the detection section later runs to find text regions

net = IENetwork(model=model_xml, weights=model_bin)

# Network that the recognition section later runs on each cropped region

text_net = IENetwork(model=text_xml, weights=text_bin)

場景文字檢測

# Alternative sample image: "D:/images/openvino_ocr.png"
image = cv2.imread("D:/images/cover_01.jpg")
cv2.imshow("image", image)

# The timer starts before preprocessing, so resize/layout conversion is
# included in the measured interval.
inf_start = time.time()

# Resize to the network input size, reorder HWC -> CHW, then add the batch
# dimension expected by the network: (n, c, h, w).
resized = cv2.resize(image, (w, h))
in_frame = resized.transpose((2, 0, 1)).reshape((n, c, h, w))

# Synchronous inference on the detection network
exec_net.infer(inputs={input_blob: in_frame})

ROI截取與文字識別

# Crop the detected text region with a small margin around its bounding box.
# The slice starts are clamped at 0: a negative start would wrap around to the
# far side of the array and produce an empty ROI for boxes that touch the
# top/left border. Slice ends past the image size are truncated automatically.
x, y, width, height = cv2.boundingRect(contours[c])
roi = image[max(y - 5, 0):y + height + 10, max(x - 5, 0):x + width + 10, :]

# Convert the crop to a single-channel image and lay it out as
# (tn, tc, th, tw) for the recognition network.
gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
text_roi = cv2.resize(gray, (tw, th))
text_roi = np.expand_dims(text_roi, 2)
text_roi = text_roi.transpose((2, 0, 1))
text_roi = text_roi.reshape((tn, tc, th, tw))

# Feed the recognition network through ITS OWN input blob name
# (text_input_blob); input_blob is the detection network's input name.
text_exec_net.infer(inputs={text_input_blob: text_roi})
text_out = text_exec_net.requests[0].outputs[text_out_blob]

CTC決議結果

# Decode the recognition output with greedy CTC: take the best symbol of each
# time step, drop '#' pad symbols, and collapse a repeated symbol unless a pad
# separated the two occurrences.
ocrstr = ""
prev_pad = False
for step_scores in text_out:
    # Each step carries a singleton batch axis; remove it before scoring.
    index, prob = ctc_soft_max(np.squeeze(step_scores, 0))
    symbol = alphabet[index]
    if symbol == '#':
        prev_pad = True
        continue
    if not ocrstr or prev_pad or symbol != ocrstr[-1]:
        prev_pad = False
        ocrstr += symbol

輸出文字檢測與識別結果

# Print the recognised string, then draw the rotated box outline and the text
# label (anchored at the bounding-box corner) onto the image.
print("result: %s" % (ocrstr,))
box_color = (0, 255, 0)
label_color = (255, 0, 0)
cv2.drawContours(image, [box], 0, box_color, 2)
cv2.putText(image, ocrstr, (x, y), cv2.FONT_HERSHEY_COMPLEX, 0.75, label_color, 1)

最后送上整個演示代碼

def demo():
    """Detect text regions in one image and recognise the text in each region.

    Pipeline:
      1. Load the CPU plugin and both IR networks (detection + recognition).
      2. Run the detection network on the resized image and threshold its
         softmax output into a binary text mask.
      3. For every external contour of the mask, crop the region, run the
         recognition network on it and greedy-CTC-decode the output.
      4. Draw boxes/labels, show the result and write it to disk.

    The function takes no arguments and returns nothing. It relies on names
    defined outside this block (``plugin_dir``, ``cpu_extension``,
    ``model_xml``, ``model_bin``, ``text_xml``, ``text_bin``, ``soft_max``,
    ``ctc_soft_max``, ``alphabet``) and on hard-coded image/output paths.
    It calls ``sys.exit(1)`` when the network contains layers the plugin
    does not support or when the input image cannot be read.
    """
    # Load the MKLDNN plugin - CPU target
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    plugin = IEPlugin(device="CPU", plugin_dirs=plugin_dir)
    plugin.add_cpu_extension(cpu_extension)

    # Load the IR files of the detection and recognition networks
    log.info("Reading IR...")
    net = IENetwork(model=model_xml, weights=model_bin)
    text_net = IENetwork(model=text_xml, weights=text_bin)

    if plugin.device == "CPU":
        supported_layers = plugin.get_supported_layers(net)
        not_supported_layers = [layer for layer in net.layers.keys() if layer not in supported_layers]
        if len(not_supported_layers) != 0:
            log.error("Following layers are not supported by the plugin for specified device {}:\n {}".
                      format(plugin.device, ', '.join(not_supported_layers)))
            log.error("Please try to specify cpu extensions library path in demo's command line parameters using -l "
                      "or --cpu_extension command line argument")
            sys.exit(1)

    # Input and output blob names of the detection network
    input_blob = next(iter(net.inputs))
    outputs = iter(net.outputs)

    # The detection network exposes two outputs; only the first is used for
    # segmentation below, the second is fetched and its shape printed.
    out_blob = next(outputs)
    second_blob = next(outputs)
    log.info("Loading IR to the plugin...")
    print("pixel output: %s, link output: %s \n"%(out_blob, second_blob))

    # Input and output blob names of the recognition network
    text_input_blob = next(iter(text_net.inputs))
    text_out_blob = next(iter(text_net.outputs))
    print("text_out_blob : %s"%text_out_blob)

    # Create the executable networks
    exec_net = plugin.load(network=net)
    text_exec_net = plugin.load(network=text_net)

    # Remember the input shapes, then drop the network descriptions
    n, c, h, w = net.inputs[input_blob].shape
    tn, tc, th, tw = text_net.inputs[text_input_blob].shape
    del net
    del text_net

    # NOTE(review): these messages mention async mode and Tab/Esc handling,
    # but this function runs one synchronous infer() and waits for any key.
    log.info("Starting inference in async mode...")
    log.info("To switch between sync and async modes press Tab button")
    log.info("To stop the demo execution press Esc button")

    # Alternative sample image: "D:/images/cover_01.jpg"
    image_path = "D:/images/openvino_ocr.png"
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising
        log.error("Failed to read image: %s", image_path)
        sys.exit(1)
    cv2.imshow("image", image)

    # Detection: the measured time includes preprocessing
    inf_start = time.time()
    in_frame = cv2.resize(image, (w, h))
    in_frame = in_frame.transpose((2, 0, 1))  # Change data layout from HWC to CHW
    in_frame = in_frame.reshape((n, c, h, w))
    exec_net.infer(inputs={input_blob: in_frame})
    inf_end = time.time()
    det_time = inf_end - inf_start

    # Fetch both outputs
    res1 = exec_net.requests[0].outputs[out_blob]
    res2 = exec_net.requests[0].outputs[second_blob]

    # Drop the batch axis
    res1 = np.squeeze(res1, 0)
    res2 = np.squeeze(res2, 0)

    # Channels-first -> channels-last
    res1 = res1.transpose((1, 2, 0))
    res2 = res2.transpose((1, 2, 0))

    print(res1.shape)
    print(res2.shape)

    # Text / non-text segmentation: channel 1 of the softmax output is
    # thresholded at 0.5 into a 0/255 mask.
    # Assumes soft_max returns an array of the same shape as its input - TODO confirm.
    res1 = soft_max(res1)
    text_prob = np.asarray(res1)[:, :, 1]
    pixel_mask = np.where(text_prob > 0.50, 255, 0).astype(np.uint8)

    se = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    mask = cv2.morphologyEx(pixel_mask, cv2.MORPH_CLOSE, se)
    cv2.imshow("text mask", mask)
    cv2.imwrite("D:/mask.png", mask)

    # Post-processing: scale the mask back to the image size and find boxes
    img_h, img_w = image.shape[:2]
    mask = cv2.resize(mask, (img_w, img_h))
    # findContours returns 3 values on OpenCV 3.x and 2 on 4.x; the contour
    # list is the second-to-last element in both.
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for contour in contours:
        rect = cv2.minAreaRect(contour)
        # np.int0 was removed in NumPy 2.0; np.intp is the type it aliased
        box = cv2.boxPoints(rect).astype(np.intp)

        # Crop with a small margin. Starts are clamped at 0 because a negative
        # slice start wraps around and would yield an empty ROI for boxes
        # touching the top/left border; ends past the image are truncated.
        x, y, width, height = cv2.boundingRect(contour)
        roi = image[max(y - 5, 0):y + height + 10, max(x - 5, 0):x + width + 10, :]
        gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
        text_roi = cv2.resize(gray, (tw, th))
        text_roi = np.expand_dims(text_roi, 2)
        text_roi = text_roi.transpose((2, 0, 1))
        text_roi = text_roi.reshape((tn, tc, th, tw))
        # The recognition network must be fed under its own input blob name
        text_exec_net.infer(inputs={text_input_blob: text_roi})
        text_out = text_exec_net.requests[0].outputs[text_out_blob]

        # Greedy CTC decoding: best symbol per step, '#' is the pad symbol,
        # repeats are collapsed unless a pad separated them.
        chars = []
        prev_pad = False
        for step_scores in text_out:
            index, _ = ctc_soft_max(np.squeeze(step_scores, 0))
            symbol = alphabet[index]
            if symbol == '#':
                prev_pad = True
            elif not chars or prev_pad or symbol != chars[-1]:
                prev_pad = False
                chars.append(symbol)
        ocrstr = "".join(chars)

        # Show the recognition result
        print("result: %s"%ocrstr)
        cv2.drawContours(image, [box], 0, (0, 255, 0), 2)
        cv2.putText(image, ocrstr, (x, y), cv2.FONT_HERSHEY_COMPLEX, 0.75, (255, 0, 0), 1)

    # Guard against a zero interval from a coarse system timer.
    fps = 1.0 / det_time if det_time > 0 else float("inf")
    # ASCII comma: Hershey fonts cannot render the fullwidth "，" and
    # putText would draw question marks in its place.
    inf_time_message = "Inference time: {:.3f} ms, FPS:{:.3f}".format(det_time * 1000, fps)
    cv2.putText(image, inf_time_message, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 255, 0), 1)
    cv2.imshow("result", image)
    cv2.imwrite("D:/result.png", image)
    cv2.waitKey(0)

    # Release resources: executable networks first, then the plugin
    cv2.destroyAllWindows()
    del exec_net
    del text_exec_net
    del plugin

演示效果
OCR識別輸出 - 效果一

OCR識別輸出 - 效果二

總結：
發現對特定的應用場景，特別是一些檔案化的影像，這個模型識別還比較準確；但對很多其它的應用場景，比如身份證、各種卡號識別，誤識別率很高，看來這些場景需要專項訓練的模型！

轉載請註明出處，本文鏈接：https://www.uj5u.com/yidong/175299.html

標籤：英特爾技術

上一篇：android 如何強制關閉app同時保存檔案

下一篇：審核2.5.14問題，有解決方案嗎？