-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport_ocr_result.m
106 lines (89 loc) · 3.89 KB
/
import_ocr_result.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
function [VarName1,VarName2,VarName3,VarName4,VarName5] = import_ocr_result(filename, startRow, endRow)
%IMPORT_OCR_RESULT Import five fixed-width columns from a text file.
%   [VARNAME1,VARNAME2,VARNAME3,VARNAME4,VARNAME5] = IMPORT_OCR_RESULT(FILENAME)
%   reads every row of the text file FILENAME.
%
%   [...] = IMPORT_OCR_RESULT(FILENAME, STARTROW, ENDROW) reads rows STARTROW
%   through ENDROW. STARTROW and ENDROW may be vectors of equal length, in
%   which case each (STARTROW(k), ENDROW(k)) pair selects one block of rows
%   and the blocks are concatenated in order.
%
%   NOTE: when fewer than three arguments are supplied, BOTH row limits fall
%   back to their defaults (1 and Inf), i.e. a lone STARTROW is ignored.
%
%   Each line is split into fields of width 8, 6, 6 and 6 characters followed
%   by the rest of the line.
%
%   Outputs:
%     VarName1          - cell array of strings (first field, whitespace trimmed)
%     VarName2..VarName5 - double column vectors parsed from fields 2..5;
%                         entries that cannot be parsed as a single number
%                         (including numbers with misplaced thousands
%                         separators such as '1,23') are NaN.
%
%   Errors:
%     import_ocr_result:fileOpenFailed if FILENAME cannot be opened.
%     Any error raised by TEXTSCAN while reading is propagated.
%
%   Example:
%     [VarName1,VarName2,VarName3,VarName4,VarName5] = import_ocr_result('results_captioncapture.txt',1, 89979);
%
%   See also TEXTSCAN, REGEXP, ONCLEANUP.

%% Initialize variables.
if nargin<=2
    startRow = 1;
    endRow = inf;
end

%% Read columns of data as strings:
% Five string fields plus a catch-all for anything left on the line.
formatSpec = '%8s%6s%6s%6s%s%[^\n\r]';

%% Open the text file.
fileID = fopen(filename,'r');
if fileID == -1
    error('import_ocr_result:fileOpenFailed', ...
        'Could not open file "%s" for reading.', filename);
end
% Guarantee the handle is released even if TEXTSCAN throws below
% ('ReturnOnError' is false, so read errors are raised as exceptions).
closeFile = onCleanup(@() fclose(fileID)); %#ok<NASGU>

%% Read columns of data according to format string.
% Empty 'Delimiter' and 'WhiteSpace' make the %Ns widths act as fixed-width
% fields, so embedded blanks are kept inside each field.
dataArray = textscan(fileID, formatSpec, endRow(1)-startRow(1)+1, ...
    'Delimiter', '', 'WhiteSpace', '', ...
    'HeaderLines', startRow(1)-1, 'ReturnOnError', false);
for block = 2:length(startRow)
    % Each additional block is located relative to the start of the file.
    frewind(fileID);
    dataArrayBlock = textscan(fileID, formatSpec, endRow(block)-startRow(block)+1, ...
        'Delimiter', '', 'WhiteSpace', '', ...
        'HeaderLines', startRow(block)-1, 'ReturnOnError', false);
    for col = 1:length(dataArray)
        dataArray{col} = [dataArray{col}; dataArrayBlock{col}];
    end
end

%% Remove white space around the string column.
dataArray{1} = strtrim(dataArray{1});

%% Convert the contents of columns containing numeric strings to numbers.
% raw starts out holding the strings of the five data columns (the trailing
% catch-all column is dropped); numeric cells are overwritten below.
raw = repmat({''}, length(dataArray{1}), length(dataArray)-1);
for col = 1:length(dataArray)-1
    raw(1:length(dataArray{col}), col) = dataArray{col};
end

% Regular expression that isolates the numeric part of a field from any
% non-numeric prefix and suffix.
regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)';
% A number containing commas is accepted only if every comma is followed by
% exactly three digits (optionally signed, optionally with a fraction).
thousandsRegExp = '^[-+]*\d+?(\,\d{3})*\.{0,1}\d*$';

for col = [2,3,4,5]
    rawData = dataArray{col};
    for row = 1:size(rawData, 1)
        try
            result = regexp(rawData{row}, regexstr, 'names');
            if isempty(result)
                % No digits in this field: leave the string in place so it
                % becomes NaN in the replacement step below.
                continue;
            end
            numbers = result.numbers;

            % Reject commas in non-thousands positions. The number string
            % itself is matched against the pattern.
            if any(numbers==',') && isempty(regexp(numbers, thousandsRegExp, 'once'))
                continue;
            end

            % Convert the numeric string to a number.
            value = textscan(strrep(numbers, ',', ''), '%f');
            if isscalar(value{1})
                % Only a single parsed value counts as a valid number;
                % empty or multi-valued results are left to become NaN.
                raw{row, col} = value{1};
            end
        catch
            % Best effort: any unexpected conversion failure leaves the
            % original string in raw, which is turned into NaN below.
        end
    end
end

%% Split data into numeric and cell columns.
rawNumericColumns = raw(:, [2,3,4,5]);
rawCellColumns = raw(:, 1);

%% Replace non-numeric cells with NaN
R = cellfun(@(x) ~isnumeric(x) && ~islogical(x), rawNumericColumns); % Find non-numeric cells
rawNumericColumns(R) = {NaN}; % Replace non-numeric cells

%% Allocate imported array to column variable names
VarName1 = rawCellColumns(:, 1);
VarName2 = cell2mat(rawNumericColumns(:, 1));
VarName3 = cell2mat(rawNumericColumns(:, 2));
VarName4 = cell2mat(rawNumericColumns(:, 3));
VarName5 = cell2mat(rawNumericColumns(:, 4));