-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocr-combiner.py
executable file
·102 lines (84 loc) · 3.54 KB
/
ocr-combiner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 18 17:21:22 2020
@author: briancroxall
This script will combine the different pages of an issue of a newspaper into
a single document.
"""
from glob import glob
import os
from collections import defaultdict
from datetime import datetime
# Record when the run began; the elapsed time is printed at the end of the
# script.
startTime = datetime.now()
def get_filename(file):
    """Return the base name of *file* without directories or final extension.

    Only the last extension is removed, so dots inside the name (for example
    ``s.c.`` in a place name) are preserved. A name that has no extension is
    returned unchanged rather than collapsing to an empty string.

    Args:
        file: Path to a file, using ``/`` as the directory separator.

    Returns:
        The file name with leading directories and the trailing extension
        stripped.
    """
    # rpartition yields the text after the last '/', or the whole string
    # when there is no directory component.
    base = file.rpartition('/')[2]
    stem, dot, _ext = base.rpartition('.')
    # No dot means no extension: keep the whole name instead of returning ''.
    return stem if dot else base
# Corpus: every OCR page file under ocr-txt/, in lexicographic order.
corpus = glob('ocr-txt/*.txt')
corpus.sort()

# Hand-picked samples: `test` holds the four pages of one issue, `test2`
# holds the eight pages of a second issue followed by the same four pages.
test = [
    'ocr-txt/the-newberry-herald-and-news_newberry,-s.c._1890-12-18_p1.txt',
    'ocr-txt/the-newberry-herald-and-news_newberry,-s.c._1890-12-18_p2.txt',
    'ocr-txt/the-newberry-herald-and-news_newberry,-s.c._1890-12-18_p3.txt',
    'ocr-txt/the-newberry-herald-and-news_newberry,-s.c._1890-12-18_p4.txt',
]
# Concatenation builds a new list, so test2 stays independent of test.
test2 = [
    'ocr-txt/the-morning-news_savannah,-ga._1891-02-25_p1.txt',
    'ocr-txt/the-morning-news_savannah,-ga._1891-02-25_p2.txt',
    'ocr-txt/the-morning-news_savannah,-ga._1891-02-25_p3.txt',
    'ocr-txt/the-morning-news_savannah,-ga._1891-02-25_p4.txt',
    'ocr-txt/the-morning-news_savannah,-ga._1891-02-25_p5.txt',
    'ocr-txt/the-morning-news_savannah,-ga._1891-02-25_p6.txt',
    'ocr-txt/the-morning-news_savannah,-ga._1891-02-25_p7.txt',
    'ocr-txt/the-morning-news_savannah,-ga._1891-02-25_p8.txt',
] + test
# Directories: make sure the output folder exists before anything is written.
os.makedirs('combined-ocr', exist_ok=True)
# Dictionaries
# Maps (newspaper, location, date) -> list of page labels (e.g. 'p1') found
# for that issue.
newsdict = defaultdict(list)

# Build dictionary
for counter, each in enumerate(corpus):
    # Progress indicator: one dot for every 100 files examined.
    if counter % 100 == 0:
        print('.', end='', flush=True)
    filename = get_filename(each)
    # File names are expected to have exactly four '_'-separated fields;
    # anything else raises ValueError here.
    newspaper, location, date, page = filename.split('_')
    # defaultdict creates the empty list on first access, so no KeyError
    # handling is needed (the former handler was unreachable and would have
    # discarded the page had it ever run).
    newsdict[(newspaper, location, date)].append(page)
print('\nDictionary built')
print('Expected number of newspaper issues: ', len(newsdict))
print('\nStarting to combine texts')


def _page_order(page):
    """Sort key that puts page labels in numeric order ('p2' before 'p10').

    Labels whose text after the first character is not a decimal number are
    placed after all numbered pages, in alphabetical order.
    """
    suffix = page[1:]
    if suffix.isdecimal():
        return (0, int(suffix), page)
    return (1, 0, page)


for (newspaper, location, date), pages in newsdict.items():
    # Open the issue file once in write mode so that a rerun, or an issue
    # with no 'p1' file, never appends to output left over from earlier runs.
    with open(f'combined-ocr/{newspaper}_{location}_{date}.txt', 'w') as save_file:
        # The page list arrives in lexicographic order (p1, p10, p2, ...);
        # sort numerically so the combined text follows the printed order.
        for page in sorted(pages, key=_page_order):
            with open(f'ocr-txt/{newspaper}_{location}_{date}_{page}.txt') as ocr:
                text = ocr.read()
            # print() terminates each page's text with a newline.
            print(text, file=save_file)
# Sanity check: compare the number of files now in combined-ocr/ with the
# number of issues collected in newsdict.
num_issues = len(glob('combined-ocr/*.txt'))
print('\nTotal number of combined newspaper issues: ', num_issues)
if num_issues != len(newsdict):
    print('Brian, we have a problem!')
else:
    print('Everything looks okay!')

# Report how long the whole run took.
print('\nTime elapsed: ', datetime.now() - startTime)

# NOTE: the bare string below is an earlier prototype kept for reference; as
# a string literal it is never executed. It unpacks file names into three
# fields (newspaper, date, page), while the live code unpacks four.
"""
for each in test:
filename = get_filename(each)
newspaper, date, page = filename.split('_')
with open(each) as read_file:
text = read_file.read()
if page == 'p1':
with open(f'combined-ocr/{newspaper}_{date}.txt', 'w') as save_file:
print(text, file=save_file)
else:
with open(f'combined-ocr/{newspaper}_{date}.txt', 'a') as save_file:
print(text, file=save_file)
"""