Skip to content

Commit

Permalink
Scrape news sites and test scraping authors:
Browse files Browse the repository at this point in the history
  • Loading branch information
bomanimc committed May 11, 2019
1 parent fccb492 commit a016ada
Show file tree
Hide file tree
Showing 5 changed files with 2,391 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
samples/
cleaned/
33 changes: 33 additions & 0 deletions authors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import csv
import datetime

# Base name of the input data set; also embedded in the output filename.
INPUT_CSV_NAME = "black_news"
# Path of the CSV file that main() reads.
INPUT_FILE_PATH = INPUT_CSV_NAME + ".csv"
# Directory plus filename prefix used when composing the output file path.
OUTPUT_FILE_BASE = "cleaned/" + "cleaned_"

def main():
    """Extract the unique author values from the input CSV and write them out.

    Reads ``INPUT_FILE_PATH`` as a comma-delimited CSV, takes the third
    column (index 2) of every row, strips surrounding whitespace, and keeps
    the non-empty values. Rows that are too short to have a third column, or
    whose third column is blank, are counted as invalid instead of raising.

    The de-duplicated values are written one per line, in sorted order, to a
    timestamped file whose path is built from ``OUTPUT_FILE_BASE`` and
    ``INPUT_CSV_NAME``. The output directory is created if it is missing.
    Summary counts are printed to stdout.

    Raises:
        OSError: if the input file cannot be opened or the output file
            cannot be created.
    """
    # Local import so this function stays self-contained.
    import os

    authors = []
    invalid_count = 0
    # newline='' is what the csv module expects so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(INPUT_FILE_PATH, mode='r', newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            # Blank lines and short rows have no third column; treat them as
            # invalid rather than crashing with IndexError.
            author = row[2].strip() if len(row) > 2 else ''
            if author:
                authors.append(author)
            else:
                invalid_count += 1

    print("Initial Sentence Lines: ", len(authors))
    print("Invalid Count: ", invalid_count)

    # Sort after de-duplicating so the output is reproducible between runs
    # (plain set iteration order is arbitrary for strings).
    unique_authors = sorted(set(authors))
    print("Num Unique Sentence Lines: ", len(unique_authors))

    write_time = datetime.datetime.now()
    # NOTE(review): str(datetime) contains spaces and colons; colons are not
    # accepted in filenames on Windows -- confirm target platforms before
    # changing the naming scheme.
    output_file = OUTPUT_FILE_BASE + "_" + INPUT_CSV_NAME + "_" + str(write_time) + '.csv'

    # Make sure the destination directory exists before opening the file.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, mode='a') as out_file:
        out_file.writelines(name + '\n' for name in unique_authors)

# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading

0 comments on commit a016ada

Please sign in to comment.