-
Notifications
You must be signed in to change notification settings - Fork 32
Pashkevich Anton #32
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Pashkevich Anton #32
Changes from 7 commits
4e932f6
a49f200
182eef5
0455a5d
c9c9aff
2885f95
3ca7f18
57b4e74
747618a
984f28f
0388792
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
Copyright (c) 2019 The Python Packaging Authority | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
##### JSON structure | ||
|
||
``` | ||
{ | ||
"news": { | ||
"feed": "Yahoo News - Latest News & Headlines", | ||
"publications": [ | ||
{ | ||
"title": "Stefanik embraces spotlight at impeachment hearings", | ||
"pub_date": "Fri, 15 Nov 2019 17:55:51 -0500", | ||
"link": "https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html", | ||
"description": "[image 2: Stefanik embraces spotlight at impeachment hearings] [2]\nThe second day of the impeachment inquiry\u2019s public hearings, on Friday, began the same way\nas the first: with an attempt by Rep. Elise Stefanik, a New York Republican, to interrupt proceedings\nwith a procedural objection.", | ||
"hrefs": [ | ||
[ | ||
"https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html", | ||
"link" | ||
], | ||
[ | ||
"http://l.yimg.com/uu/api/res/1.2/NRuDo56c6EiwjZH4WOqEZg--/YXBwaWQ9eXRhY2h5b247aD04Njt3PTEzMDs-/https://media-mbst-pub-ue1.s3.amazonaws.com/creatr-uploaded-images/2019-11/7a1d0760-07d6-11ea-bef7-f17150574bb2", | ||
"image", | ||
"Stefanik embraces spotlight at impeachment hearings" | ||
] | ||
] | ||
} | ||
] | ||
} | ||
} | ||
``` | ||
|
||
##### Cache description | ||
|
||
News received from the feed is cached in a database that is created locally.
|
||
The database consists of a single file named "cache.db". It has the following structure:
|
||
| | id | feed | title | pub_date | pub_parsed | link | description | hrefs | | ||
|-----|------|------|-------|----------|------------|------|-------------|-------| | ||
|post | .. | ... | ... | ... | ... | ... | ... | ... | | ||
|
||
All fields except "id" have text type. The "id" field serves as the post's primary key.
|
||
The "hrefs" field contains all of a post's links, including image links and image descriptions.
The section of regular references and the section of image links are separated by the --|-- sequence.
Items within one section are separated by the -+- sequence, and -|- divides a link, its type, and the image description.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
feedparser | ||
bs4 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
""" | ||
this module provides tools for caching news | ||
|
||
it includes functions for work with database and support ones | ||
""" | ||
|
||
import sqlite3 | ||
from re import match | ||
|
||
def init_database(db_path='cache.db'):
    """
    Create (if needed) and open the SQLite database used for caching news.

    :param db_path: path of the database file; defaults to the historical
        "cache.db" in the current working directory, so existing callers
        are unaffected (":memory:" is handy for tests).
    :return: tuple (connection, cursor) for the opened database.
    """
    connection_obj = sqlite3.connect(db_path)
    cursor_obj = connection_obj.cursor()
    # IF NOT EXISTS makes repeated startups idempotent.
    cursor_obj.execute(
        '''CREATE TABLE IF NOT EXISTS cache (id integer primary key, feed text, title text, pub_date text, pub_parsed text, link text, description text, hrefs text)'''
    )
    connection_obj.commit()

    return connection_obj, cursor_obj
|
||
def cache_news(connection_obj, cursor_obj, news):
    """
    Insert parsed news posts into the cache table, skipping duplicates.

    A post is a duplicate when a row with all seven field values already
    exists in the table.

    :param connection_obj: open sqlite3 connection (used for the commit).
    :param cursor_obj: cursor bound to the same connection.
    :param news: iterable of post dicts as produced by the parser.
    """
    for post in news:
        # Serialize the post once and reuse the tuple for both the
        # duplicate check and the insert (the original called
        # hrefs_to_text twice per post).
        row = (
            post['feed'], post['title'], post['pub_date'], post['pub_parsed'],
            post['link'], post['description'], hrefs_to_text(post['hrefs'])
        )
        cursor_obj.execute(
            '''SELECT id FROM cache WHERE feed=? AND title=? AND pub_date=? AND pub_parsed=? AND link=? AND description=? AND hrefs=?''',
            row
        )
        if cursor_obj.fetchone() is None:
            cursor_obj.execute(
                '''INSERT INTO cache (feed, title, pub_date, pub_parsed, link, description, hrefs) VALUES (?, ?, ?, ?, ?, ?, ?)''',
                row
            )
    # One commit for the whole batch instead of one per inserted row.
    connection_obj.commit()

    return
|
||
def get_cached_news(cursor_obj, date):
    """
    Load all cached posts published on the given date.

    :param cursor_obj: cursor for the cache database.
    :param date: publication date string as stored in the pub_parsed column.
    :return: list of post dicts with the hrefs column decoded into tuples.
    """
    cursor_obj.execute('''SELECT * FROM cache WHERE pub_parsed=?''', (date, ))

    news = []
    for row in cursor_obj.fetchall():
        # Columns 1..6 map straight onto the post dict (column 0 is the id).
        post = dict(zip(
            ('feed', 'title', 'pub_date', 'pub_parsed', 'link', 'description'),
            row[1:7]
        ))

        # Stored hrefs format: "--|--" separates the plain-link section from
        # the image section, "-+-" separates items, "-|-" separates fields.
        sections = row[7].split("--|--")
        post['hrefs'] = [
            tuple(chunk.split("-|-"))
            for chunk in sections[0].split("-+-") if chunk != ''
        ]
        if len(sections) > 1:
            post['hrefs'] += [
                tuple(chunk.split("-|-"))
                for chunk in sections[1].split("-+-") if chunk != ''
            ]
        news.append(post)

    return news
|
||
def hrefs_to_text(link_list):
    """
    Serialize a post's link list into the single-string cache format.

    Plain links are emitted first as "-+-url-|-type" chunks; a "--|--"
    marker then introduces image entries as "url-|-type-|-alt-+-" chunks.
    Image entries are assumed to appear only after all plain links
    (3-tuples after the first 'image' item) — TODO confirm with callers.

    :param link_list: list of (url, type) or (url, 'image', alt) tuples.
    :return: encoded string (empty when link_list is empty).
    """
    parts = []
    image_start = -1
    # enumerate() gives the position directly; the original re-scanned the
    # list with list.index(tpl), a redundant linear search.
    for ind, tpl in enumerate(link_list):
        if tpl[1] != 'image':
            parts.append(f"-+-{tpl[0]}-|-{tpl[1]}")
        else:
            parts.append('--|--')
            image_start = ind
            break

    if image_start != -1:
        for tpl in link_list[image_start:]:
            parts.append(f"{tpl[0]}-|-{tpl[1]}-|-{tpl[2]}-+-")

    # join() instead of repeated += avoids quadratic string building.
    return ''.join(parts)
|
||
def is_valid_date(line):
    """
    Check whether *line* looks like a date in YYYYMMDD format.

    The check is intentionally loose (e.g. month "19" would pass); it only
    guards against obviously malformed input such as wrong length or
    non-digit characters.

    :param line: string to validate.
    :return: True when the string matches the pattern, False otherwise.
    """
    # bool() instead of returning the raw Match/None keeps the truthy
    # contract for existing callers while exposing a plain boolean.
    return bool(match(r"^[1-2][0-9]{3}[0-1][0-9][0-3][0-9]$", line))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,201 @@ | ||
""" | ||
main rss_reader module | ||
""" | ||
|
||
import sys | ||
import argparse | ||
import logging | ||
import html | ||
import json | ||
import feedparser | ||
from bs4 import BeautifulSoup | ||
import cacher | ||
|
||
def init_cli_parser(argv=None):
    """
    Build the command-line parser and parse the arguments.

    :param argv: optional list of argument strings; None (the default)
        keeps the original behavior of reading sys.argv[1:].  Passing a
        list makes the function testable without touching sys.argv.
    :return: argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader.', prog='rss-reader')
    # Exactly one of the positional source URL or --date must be given.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("source", type=str, nargs='?', default=None, help="RSS URL")
    parser.add_argument('--version', help="print version info", action='version', version='%(prog)s 1.3')
    parser.add_argument("--json", help="print result as JSON in stdout", action="store_true")
    parser.add_argument("--verbose", help="output verbose status messages", action="store_true")
    group.add_argument("--date", type=str, help="print news with provided publish date in stdout")
    parser.add_argument("--limit", type=int, help="limit news topics if this parameter provided")

    return parser.parse_args(argv)
|
||
def init_logger():
    """
    Configure and return the root logger, writing INFO+ records to
    "rss_reader_logs.txt".

    Calling this more than once no longer stacks duplicate file handlers
    (the original added a new FileHandler on every call, duplicating each
    log record).

    :return: the configured root logger.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # Only attach the file handler if an equivalent one is not present yet.
    already_attached = any(
        isinstance(handler, logging.FileHandler)
        and getattr(handler, 'baseFilename', '').endswith("rss_reader_logs.txt")
        for handler in logger.handlers
    )
    if not already_attached:
        file_handler = logging.FileHandler("rss_reader_logs.txt")
        file_handler.setFormatter(logging.Formatter('%(asctime)s -- %(levelname)s -- %(message)s'))
        logger.addHandler(file_handler)

    return logger
|
||
def brush_text(line):
    """
    Reflow *line* so each output line is roughly 100 characters, breaking
    only at spaces.

    Starting 10 characters before each 100-character boundary, the nearest
    following space is replaced with a newline; when no space remains past
    that point, the rest of the text is left untouched.

    :param line: text to reflow.
    :return: text with newline characters inserted.
    """
    start = 100
    while True:
        # str.find() replaces the original manual index walk guarded by
        # IndexError: -1 means "no space at or after this position".
        cut = line.find(' ', start - 10)
        if cut == -1:
            break
        line = line[:cut] + "\n" + line[cut + 1:]
        start += 100

    return line
|
||
def get_post_content(post, feed_title):
    """
    Extract the fields of interest from a single feedparser entry.

    :param post: feedparser entry; its title, published, published_parsed,
        link and description attributes are read.
    :param feed_title: title of the feed the post belongs to.
    :return: dict with feed/title/pub_date/pub_parsed/link/description/hrefs.
    """
    data = {}
    data['feed'] = feed_title
    data['title'] = html.unescape(post.title)
    data['pub_date'] = post.published
    # Zero-pad month and day so pub_parsed is always 8 digits (YYYYMMDD).
    # The unpadded original (e.g. "201915" for Jan 5) could never match
    # cacher.is_valid_date's 8-digit pattern, breaking --date lookups.
    data['pub_parsed'] = (
        f"{post.published_parsed.tm_year:04}"
        f"{post.published_parsed.tm_mon:02}"
        f"{post.published_parsed.tm_mday:02}"
    )
    data['link'] = post.link
    soup = BeautifulSoup(post.description, 'html.parser')
    data['description'] = brush_text(html.unescape(soup.text))
    # Collect <a href=...> targets; entries without an href are skipped.
    data['hrefs'] = [(link['href'], 'link') for link in soup.find_all('a') if link.get('href', None)]
    for img in soup.find_all('img'):
        if not img.get('src', 'Unknown') == '':
            data['hrefs'] += [(img.get('src', 'Unknown'), 'image', img.get('alt', ''))]
            # Prepend an "[image N: alt] [N]" marker referencing the href index.
            data['description'] = \
                f"[image {len(data['hrefs'])}: {img.get('alt', '')}] [{len(data['hrefs'])}]\n" + data['description']

    return data
|
||
def parse_news(url):
    """
    Download and parse the RSS feed at *url*.

    :param url: feed URL (or anything else feedparser accepts).
    :return: list of post dicts (see get_post_content).
    :raises ValueError: when the feed is malformed or unreachable
        (feedparser sets bozo=1 on any parse/transport error).
    """
    feed = feedparser.parse(url)
    if feed.bozo == 1:
        # Carry a message so tracebacks/logs explain what went wrong.
        raise ValueError("not well-formed xml or broken access to the Internet")

    news = []
    for post in feed.entries:
        news.append(get_post_content(post, feed.feed.title))

    return news
|
||
def display_news(news):
    """
    Pretty-print a list of post dicts to stdout.

    When every post comes from the same feed, the feed title is printed
    once at the top instead of before every post.
    """
    if not news:
        return

    first_feed = news[0]['feed']
    single_feed = all(post['feed'] == first_feed for post in news)
    if single_feed:
        print(f"Feed: {first_feed}\n")

    for post in news:
        if not single_feed:
            print(f"Feed: {post['feed']}\n")
        print(f"Title: {post['title']}")
        print(f"Publication date: {post['pub_date']}")
        print(f"Link: {post['link']}\n")
        print(f"{post['description']}\n")
        print("Links:")
        for num, href in enumerate(post['hrefs'], start=1):
            print(f"[{num}] {href[0]} ({href[1]})")
        print('\n')

    return
|
||
def to_json(news):
    """
    Render the news list as a JSON document of the form {"news": [...]}.

    Note: the internal 'pub_parsed' key is stripped from each post dict
    IN PLACE before serializing (callers' dicts are mutated).

    :param news: list of post dicts.
    :return: JSON string with 2-space indentation.
    """
    for item in news:
        # pop() with a default tolerates posts that never had the key;
        # the original `del` raised KeyError there, and its
        # `news[ind] = item` re-assignment was a no-op.
        item.pop('pub_parsed', None)

    return json.dumps({'news': news}, indent=2)
|
||
def main():
    """
    Entry point: parse CLI arguments, then either fetch news from the given
    RSS source (caching them) or load previously cached news for --date,
    and print the result as plain text or JSON.
    """
    logger = init_logger()
    args = init_cli_parser()
    connection, cursor = cacher.init_database()

    if args.verbose:
        # Mirror log records to stdout so the user sees progress live;
        # print() is skipped below in that case to avoid double output.
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.info("verbose notifications are turned on")

    # `is not None` rather than plain truthiness: the original skipped
    # validation for --limit 0, which then silently meant "no limit".
    if args.limit is not None and args.limit < 1:
        if not args.verbose:
            print("error: invalid limit value")
        logger.error("invalid limit value")
        logger.info("end of work -->|")
        return

    if args.date:
        try:
            logger.info("checking date..")
            if not cacher.is_valid_date(args.date):
                raise ValueError("invalid date")

            logger.info("started fetching data from cache..")
            news = cacher.get_cached_news(cursor, args.date)
            if len(news) == 0:
                raise IndexError("no news for this date")
            news = news[:args.limit if args.limit else len(news)]
        except ValueError:
            if not args.verbose:
                print("error: invalid date")
            logger.error("invalid date")
            logger.info("end of work -->|")
            return
        except IndexError:
            if not args.verbose:
                print("no news for this date")
            logger.info("no news for this date")
            logger.info("end of work -->|")
            return

    if args.source:
        # Lazy %-style args instead of f-strings keeps log formatting cheap.
        logger.info("started fetching data (url - %s)..", args.source)
        try:
            news = parse_news(args.source)
            logger.info("started caching data..")
            cacher.cache_news(connection, cursor, news)
            news = news[:args.limit if args.limit else len(news)]
        except ValueError:
            if not args.verbose:
                print("error: not well-formed xml or broken access to the Internet")
            logger.error("not well-formed xml or broken access to the Internet")
            logger.info("end of work -->|")
            return

    if args.limit:
        logger.info("the limit of publications to show - %s", args.limit)

    if not args.json:
        logger.info("displaying news..\n")
        display_news(news)
    else:
        logger.info("displaying news in json format..\n")
        print(to_json(news))

    logger.info("\npublications were successfully shown - %s", len(news))
    logger.info("end of work -->|")

    return
|
||
# Run the reader only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import setuptools

# The long description shown on PyPI comes straight from the README.
with open("README.md", "r") as f:
    long_description = f.read()

setuptools.setup(
    name="rss-reader",
    # Kept in sync with the CLI's `--version` string (1.3); they disagreed.
    version="1.3",
    author="Anton Pashkevich",
    author_email="mario.lazer@mail.ru",
    description="Pure Python command-line RSS reader",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/prague15031939/PythonHomework",
    packages=setuptools.find_packages(),
    # The project ships flat modules, so they must be listed explicitly;
    # entry_points exports the `rss-reader` console command — without it
    # installation produced no runnable utility (per review feedback).
    py_modules=["rss_reader", "cacher"],
    entry_points={
        "console_scripts": [
            "rss-reader=rss_reader:main",
        ],
    },
    # Mirrors requirements.txt so `pip install` pulls the runtime deps.
    install_requires=[
        "feedparser",
        "bs4",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.8',
)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.