# test_kindai_1.0.py

"""  
Copyright (c) 2019-present NAVER Corp.
MIT License
"""

# -*- coding: cp932 -*-
import sys
import os
import time
import argparse

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from torch.autograd import Variable

from PIL import Image, ImageDraw, ImageFont
from utils import dataIterator, load_dict, gen_sample, load_mapping
from encoder_decoder import Encoder_Decoder

import cv2
from skimage import io
import numpy as np
import craft_utils
import imgproc
import file_utils
import json
import zipfile
import xml.etree.cElementTree as ET
import xml.dom.minidom as minidom
import codecs
from craft import CRAFT

from collections import OrderedDict
def copyStateDict(state_dict):
    """Return a copy of ``state_dict`` with a leading ``module`` key prefix removed.

    If the first key starts with ``"module"`` the first dot-separated
    component is dropped from *every* key; otherwise keys are copied
    unchanged.  Only the first key is inspected to make that decision.

    Args:
        state_dict: Mapping of parameter names to values.

    Returns:
        A new ``OrderedDict`` in the same iteration order.  An empty input
        yields an empty ``OrderedDict`` instead of raising ``IndexError``.
    """
    # next(iter(...), "") looks at the first key without materialising the
    # whole key list and tolerates an empty mapping.
    first_key = next(iter(state_dict), "")
    start_idx = 1 if first_key.startswith("module") else 0
    return OrderedDict(
        (".".join(key.split(".")[start_idx:]), value)
        for key, value in state_dict.items()
    )

def str2bool(v):
    """Interpret a string as a boolean, case-insensitively.

    Returns True for "yes", "y", "true", "t" and "1"; any other string
    yields False.
    """
    truthy_words = {"yes", "y", "true", "t", "1"}
    normalized = v.lower()
    return normalized in truthy_words

def pil2cv(imgPIL):
    """Convert an image to a NumPy array with its channel axis reversed.

    Args:
        imgPIL: Anything ``np.array`` turns into a 3-D (rows, cols, channels)
            array, e.g. a PIL RGB image.

    Returns:
        A C-contiguous array holding the same pixels with the last axis
        reversed (RGB -> BGR for a PIL RGB image).
    """
    # Reversing with [::-1] alone yields a negative-stride view; copying it
    # into contiguous memory keeps the result usable by OpenCV functions that
    # write into their input array.
    return np.ascontiguousarray(np.array(imgPIL)[:, :, ::-1])

def cv2pil(imgCV):
    """Build a PIL image from ``imgCV`` after reversing its last (channel) axis."""
    return Image.fromarray(imgCV[:, :, ::-1])
def cv2_putChar(draw, char, x, y, fontPIL, colorRGB):
    """Draw ``char`` at position ``(x, y)`` by calling ``draw.text``.

    ``fontPIL`` and ``colorRGB`` are forwarded as the ``font`` and ``fill``
    keyword arguments.  Nothing is returned.
    """
    position = (x, y)
    draw.text(xy=position, text=char, fill=colorRGB, font=fontPIL)

def cv2_putText_1(img, text, org, fontFace, fontScale, color):
    """Draw ``text`` onto ``img`` one character at a time and return the result.

    The image is converted with ``cv2pil``, drawn on with PIL, and converted
    back with ``pil2cv``.  Because both conversions reverse the channel axis,
    ``color`` is effectively given in the reverse of ``img``'s channel order.

    Args:
        img: Image array accepted by ``cv2pil``.
        text: String to draw; each character is placed individually.
        org: ``(min_x, max_x, min_y, max_y)`` of the box the text belongs to.
        fontFace: Font file passed to ``ImageFont.truetype``.
        fontScale: Font size passed to ``ImageFont.truetype``.
        color: Fill colour passed to ``draw.text``.

    Returns:
        The image array produced by ``pil2cv`` after drawing.
    """
    min_x, max_x, min_y, max_y = org

    imgPIL = cv2pil(img)
    draw = ImageDraw.Draw(imgPIL)
    fontPIL = ImageFont.truetype(font=fontFace, size=fontScale)

    # ImageDraw.textsize was deprecated in Pillow 9.2 and removed in 10.0.
    # Use it while it exists; otherwise take the right/bottom edge of the
    # bounding box anchored at the origin as the character's extent.
    legacy_textsize = getattr(draw, "textsize", None)

    def char_size(char):
        if legacy_textsize is not None:
            return legacy_textsize(char, font=fontPIL)
        _, _, right, bottom = draw.textbbox((0, 0), char, font=fontPIL)
        return right, bottom

    # A box at least as wide as it is tall is treated as a horizontal line.
    horizontal = max_x - min_x >= max_y - min_y
    if horizontal:
        # horizontal line: start at the left end, at the box's max_y
        x, y = min_x, max_y
    else:
        # vertical line: start at the top, 10 px left of the box's max_x
        x, y = max_x - 10, min_y

    for char in text:
        cv2_putChar(draw, char, x, y, fontPIL, color)
        w, h = char_size(char)
        # advance along the writing direction with a 10 px gap
        if horizontal:
            x += w + 10
        else:
            y += h + 10

    return pil2cv(imgPIL)


parser = argparse.ArgumentParser(description='Kindai document Recognition')
# params for text detection
parser.add_argument('--trained_model', default='./pretrain/synweights_4600.pth', type=str, help='pretrained model')
parser.add_argument('--text_threshold', default=0.7, type=float, help='text confidence threshold')
parser.add_argument('--low_text', default=0.4, type=float, help='text low-bound score')
parser.add_argument('--link_threshold', default=0.4, type=float, help='link confidence threshold')
parser.add_argument('--cuda', default=True, type=str2bool, help='Use cuda to train model')
parser.add_argument('--canvas_size', default=1000, type=int, help='image size for inference')
parser.add_argument('--mag_ratio', default=2, type=float, help='image magnification ratio')
parser.add_argument('--poly', default=False, action='store_true', help='enable polygon type')
# NOTE: default=True combined with store_true means this option is always on.
parser.add_argument('--show_time', default=True, action='store_true', help='show processing time')
# The default matches the folder that was previously hard-coded below, so a
# run without --test_folder reads the same images as before.
parser.add_argument('--test_folder', default='./data/test', type=str, help='folder path to input images')

# params for text recognition
parser.add_argument('--model_path', default='./pretrain/WAP_params.pkl', type=str)
parser.add_argument('--dictionary_target', default='./pretrain/kindai_voc.txt', type=str)


args = parser.parse_args()


# For test images in a folder: honour --test_folder instead of ignoring it.
image_list, _, _ = file_utils.get_files(args.test_folder)

# Output directory for rendered results; also creates missing parent
# directories and tolerates the directory already existing.
result_folder = './data/result1/'
os.makedirs(result_folder, exist_ok=True)

def test_net(net, image, text_threshold, link_threshold, low_text, cuda, poly):
    """Run the text detector on one image and post-process its score maps.

    Reads ``args.canvas_size``, ``args.mag_ratio`` and ``args.show_time`` from
    the module-level ``args``.

    Args:
        net: Detection model; called as ``net(x)`` and expected to return a
            pair whose first item is indexed as ``y[0, :, :, 0]`` (text score)
            and ``y[0, :, :, 1]`` (link score).
        image: Input image passed to ``imgproc.resize_aspect_ratio``.
        text_threshold, link_threshold, low_text, poly: Forwarded unchanged
            to ``craft_utils.getDetBoxes``.
        cuda: When true the input tensor is moved to the GPU.

    Returns:
        ``(boxes, polys, ret_score_text)``: boxes and polygons rescaled with
        ``craft_utils.adjustResultCoordinates`` (a polygon that is ``None`` is
        replaced by its box), and the heat-map image that
        ``imgproc.cvt2HeatmapImg`` builds from the text and link score maps
        placed side by side.
    """
    t0 = time.time()

    # resize
    img_resized, target_ratio, _ = imgproc.resize_aspect_ratio(
        image, args.canvas_size, interpolation=cv2.INTER_LINEAR, mag_ratio=args.mag_ratio)
    ratio_h = ratio_w = 1 / target_ratio

    # preprocessing
    x = imgproc.normalizeMeanVariance(img_resized)
    x = torch.from_numpy(x).permute(2, 0, 1)    # [h, w, c] to [c, h, w]
    x = x.unsqueeze(0)                          # [c, h, w] to [b, c, h, w]
    if cuda:
        x = x.cuda()

    # forward pass; gradients are never used here, so skip building the
    # autograd graph
    with torch.no_grad():
        y, _ = net(x)

    # make score and link map
    score_text = y[0, :, :, 0].cpu().data.numpy()
    score_link = y[0, :, :, 1].cpu().data.numpy()

    t0 = time.time() - t0
    t1 = time.time()

    # Post-processing
    boxes, polys = craft_utils.getDetBoxes(score_text, score_link, text_threshold, link_threshold, low_text, poly)

    # coordinate adjustment
    boxes = craft_utils.adjustResultCoordinates(boxes, ratio_w, ratio_h)
    polys = craft_utils.adjustResultCoordinates(polys, ratio_w, ratio_h)
    for k, candidate in enumerate(polys):
        if candidate is None:
            polys[k] = boxes[k]

    t1 = time.time() - t1

    # render results (optional); hstack already allocates a new array
    render_img = np.hstack((score_text, score_link))
    ret_score_text = imgproc.cvt2HeatmapImg(render_img)

    if args.show_time:
        print("\ninfer/postproc time : {:.3f}/{:.3f}".format(t0, t1))

    return boxes, polys, ret_score_text


def _load_checkpoint(model, checkpoint_path):
    """Load the weights stored at ``checkpoint_path`` into ``model`` in place.

    The checkpoint is mapped to the CPU when ``args.cuda`` is false; its keys
    are passed through ``copyStateDict`` before loading.
    """
    if args.cuda:
        state = torch.load(checkpoint_path)
    else:
        state = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(copyStateDict(state))


def _line_input_size(w, h):
    """Return ``(new_w, new_h, rate)`` for resizing a line crop of size ``w`` x ``h``.

    The shorter side is scaled to 20 px and the longer side is rounded to a
    multiple of 20; ``rate`` is the scale factor that was applied.
    """
    if w < h:
        rate = 20.0 / w
        new_w = int(round(w * rate))
        new_h = int(round(h * rate / 20.0) * 20)
    else:
        rate = 20.0 / h
        new_w = int(round(w * rate / 20.0) * 20)
        new_h = int(round(h * rate))
    return new_w, new_h, rate


def _decode_sample(token_ids, alpha_past, worddicts_r, rate, min_x, min_y):
    """Turn one decoded token sequence into text and per-character positions.

    Decoding stops at the first token equal to 0 (``<eol>``).  Every other
    token is looked up in ``worddicts_r``, whose entries are hexadecimal code
    point strings, and converted with ``chr``.

    Returns:
        ``(text, locations)`` where ``locations[i]`` is ``[x, y]`` of the
        maximum of step ``i``'s attention map, mapped back to page
        coordinates.  The caller currently uses only ``text``.
    """
    chars = []
    locations = []
    for step, token in enumerate(token_ids):
        if token == 0:  # <eol>
            break
        # presumably alpha_past is cumulative over steps, so the difference
        # isolates this step's attention -- TODO confirm against gen_sample
        alpha = alpha_past[step]
        if step != 0:
            alpha = alpha_past[step] - alpha_past[step - 1]
        (y, x) = np.unravel_index(np.argmax(alpha, axis=None), alpha.shape)
        # NOTE(review): 16 looks like the encoder's downsampling factor -- confirm
        locations.append([int(16 * x / rate) + min_x, int(16 * y / rate) + min_y])
        chars.append(chr(int(worddicts_r[token], 16)))
    return ''.join(chars), locations


def test(text_detection_modelpara, ocr_modelpara, dictionary_target):
    """Detect and recognise text lines in every image of ``image_list``.

    For each image the detector (``CRAFT``) proposes boxes, each box is
    cropped, resized, converted to grey scale and decoded by the OCR model
    (``Encoder_Decoder`` via ``gen_sample``).  The recognised text is drawn
    onto the image, the image is saved through ``file_utils.saveResult`` into
    ``result_folder``, and one ``<page>`` element with a ``<line>`` per box is
    added to an XML tree that is finally written to ``./data/result.xml`` in
    Shift_JIS.

    Uses the module-level ``args``, ``image_list`` and ``result_folder``.

    Args:
        text_detection_modelpara: Path of the detector checkpoint.
        ocr_modelpara: Path of the OCR model checkpoint.
        dictionary_target: Path of the vocabulary file read by ``load_dict``.
    """
    # load net
    net = CRAFT()     # initialize

    print('Loading text detection model from checkpoint {}'.format(text_detection_modelpara))
    _load_checkpoint(net, text_detection_modelpara)

    if args.cuda:
        net = net.cuda()
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = False

    params = {
        'n': 256,
        'm': 256,
        'dim_attention': 512,
        'D': 684,
        'K': 5748,
        'growthRate': 24,
        'reduction': 0.5,
        'bottleneck': True,
        'use_dropout': True,
        'input_channels': 3,
        'cuda': args.cuda,
    }

    # load model
    OCR = Encoder_Decoder(params)
    _load_checkpoint(OCR, ocr_modelpara)
    if args.cuda:
        # NOTE(review): unlike the detector, the OCR model is not moved with
        # .cuda() before being wrapped -- confirm this is intended.
        OCR = torch.nn.DataParallel(OCR)
        cudnn.benchmark = False

    OCR.eval()
    net.eval()

    # load dictionary and build the reverse (index -> entry) table
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk
    t = time.time()

    fontPIL = '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf' # japanese font
    size = 40
    colorBGR = (0, 0, 255)

    paper = ET.Element('paper')
    paper.set('xmlns', "http://codh.rois.ac.jp/modern-magazine/")

    # load data
    for k, image_path in enumerate(image_list):
        print("Test image {:d}/{:d}: {:s}".format(k+1, len(image_list), image_path), end='\r')

        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread returns None for a missing or undecodable file;
            # skip it instead of failing in cvtColor.
            print('skip unreadable image: {}'.format(image_path))
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        page_h, page_w = image.shape[0], image.shape[1]
        print(image_path)
        page = ET.SubElement(paper, "page")
        page.set('file', os.path.basename(image_path).replace('.jpg', ''))
        page.set('height', str(page_h))
        page.set('width', str(page_w))
        page.set('dpi', str(100))
        page.set('number', str(1))

        bboxes, polys, _ = test_net(net, image, args.text_threshold, args.link_threshold, args.low_text, args.cuda, args.poly)
        for box in bboxes:
            poly = np.array(box).astype(np.int32)
            min_x = np.min(poly[:, 0])
            max_x = np.max(poly[:, 0])
            min_y = np.min(poly[:, 1])
            max_y = np.max(poly[:, 1])
            if min_x < 0:
                min_x = 0
            if min_y < 0:
                min_y = 0

            input_img = image[min_y:max_y, min_x:max_x]
            if input_img.size == 0:
                # A degenerate box yields an empty crop, which cv2.resize
                # rejects; there is nothing to recognise in it.
                continue

            w = max_x - min_x + 1
            h = max_y - min_y + 1
            line = ET.SubElement(page, "line")
            line.set("x", str(min_x))
            line.set("y", str(min_y))
            line.set("height", str(h))
            line.set("width", str(w))

            w, h, rate = _line_input_size(w, h)
            input_img = cv2.resize(input_img, (w, h))

            # weighted sum of the three channels -> single-channel uint8 image
            mat = np.zeros([1, h, w], dtype='uint8')
            mat[0, :, :] = 0.299 * input_img[:, :, 0] + 0.587 * input_img[:, :, 1] + 0.114 * input_img[:, :, 2]

            xx_pad = mat.astype(np.float32) / 255.
            xx_pad = torch.from_numpy(xx_pad[None, :, :, :])  # (1,1,H,W)
            if args.cuda:
                # NOTE(review): Tensor.cuda() returns a new tensor and the
                # result is discarded here, so xx_pad itself is unchanged.
                # Left as is because gen_sample's device handling is not
                # visible from this file -- confirm before reassigning.
                xx_pad.cuda()
            with torch.no_grad():
                sample, score, alpha_past_list = gen_sample(OCR, xx_pad, params, args.cuda, k=10, maxlen=600)
            # length-normalise the scores and keep the lowest-scoring candidate
            score = score / np.array([len(s) for s in sample])
            best = score.argmin()
            result, _ = _decode_sample(sample[best], alpha_past_list[best], worddicts_r, rate, min_x, min_y)

            line.text = result
            image = cv2_putText_1(img=image, text=result, org=(min_x, max_x, min_y, max_y), fontFace=fontPIL, fontScale=size, color=colorBGR)

        print('save image')
        file_utils.saveResult(image_path, image, polys, dirname=result_folder)

    xml_string = ET.tostring(paper, 'Shift_JIS')

    # the context manager closes the file even if the write fails
    with codecs.open('./data/result.xml', 'w', 'shift_jis') as fout:
        fout.write(xml_string.decode('shift_jis'))

    print("elapsed time : {}s".format(time.time() - t))

if __name__ == "__main__":
    # Script entry point: run the pipeline with the checkpoint and dictionary
    # paths taken from the parsed command-line arguments.
    test(
        text_detection_modelpara=args.trained_model,
        ocr_modelpara=args.model_path,
        dictionary_target=args.dictionary_target,
    )