Pashkevich Anton #32

Open
wants to merge 11 commits into master
Changes from 7 commits
19 changes: 19 additions & 0 deletions final_task/LICENSE
@@ -0,0 +1,19 @@
Copyright (c) 2019 The Python Packaging Authority

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
44 changes: 44 additions & 0 deletions final_task/README.md
@@ -0,0 +1,44 @@
##### JSON structure

```
{
  "news": {
    "feed": "Yahoo News - Latest News & Headlines",
    "publications": [
      {
        "title": "Stefanik embraces spotlight at impeachment hearings",
        "pub_date": "Fri, 15 Nov 2019 17:55:51 -0500",
        "link": "https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html",
        "description": "[image 2: Stefanik embraces spotlight at impeachment hearings] [2]\nThe second day of the impeachment inquiry\u2019s public hearings, on Friday, began the same way\nas the first: with an attempt by Rep. Elise Stefanik, a New York Republican, to interrupt proceedings\nwith a procedural objection.",
        "hrefs": [
          [
            "https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html",
            "link"
          ],
          [
            "http://l.yimg.com/uu/api/res/1.2/NRuDo56c6EiwjZH4WOqEZg--/YXBwaWQ9eXRhY2h5b247aD04Njt3PTEzMDs-/https://media-mbst-pub-ue1.s3.amazonaws.com/creatr-uploaded-images/2019-11/7a1d0760-07d6-11ea-bef7-f17150574bb2",
            "image",
            "Stefanik embraces spotlight at impeachment hearings"
          ]
        ]
      }
    ]
  }
}
```
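
For illustration, a minimal sketch (not part of the package) of consuming this output with the standard library; the file name `news.json` is hypothetical:

```python
import json

# Load output previously saved from the reader's --json mode
# (hypothetical file name, for illustration only).
with open("news.json") as f:
    data = json.load(f)

print(data["news"]["feed"])
for post in data["news"]["publications"]:
    print(post["title"], "-", post["link"])
    for href in post["hrefs"]:
        # href is [url, type] for links and [url, "image", alt] for images
        print("   ", href[0], f"({href[1]})")
```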

##### Cache description

News received from a feed is cached in a database that is created locally.

The database consists of a single file named "cache.db". It has the following structure:

| | id | feed | title | pub_date | pub_parsed | link | description | hrefs |
|-----|------|------|-------|----------|------------|------|-------------|-------|
|post | .. | ... | ... | ... | ... | ... | ... | ... |

All fields except "id" are of text type. The "id" field serves as the post's primary key.

The "hrefs" field contains all of a post's links, including image links and image descriptions.
The regular-link section and the image-link section are separated by the `--|--` sequence.
Items within a section are separated by the `-+-` sequence, and `-|-` separates a link from its type and, for images, its description.
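
As a worked example (hypothetical URLs), a post with one regular link and one image link would be serialized as:

```
-+-https://example.com/post-|-link--|--https://example.com/img.png-|-image-|-alt text-+-
```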
2 changes: 2 additions & 0 deletions final_task/requirements.txt
@@ -0,0 +1,2 @@
feedparser
bs4
Empty file.
93 changes: 93 additions & 0 deletions final_task/rss_reader/cacher.py
@@ -0,0 +1,93 @@
"""
this module provides tools for caching news

it includes functions for work with database and support ones
"""

import sqlite3
from re import match

def init_database():
"""
this function creates and initizlizes database for caching news
"""
connection_obj = sqlite3.connect('cache.db')
cursor_obj = connection_obj.cursor()
cursor_obj.execute(
'''CREATE TABLE IF NOT EXISTS cache (id integer primary key, feed text, title text, pub_date text, pub_parsed text, link text, description text, hrefs text)'''
)
connection_obj.commit()

return connection_obj, cursor_obj

def cache_news(connection_obj, cursor_obj, news):
    """
    this function adds parsed news to the database, skipping duplicates
    """
    for post in news:
        cursor_obj.execute(
            '''SELECT id FROM cache WHERE feed=? AND title=? AND pub_date=? AND pub_parsed=?
               AND link=? AND description=? AND hrefs=?''',
            (post['feed'], post['title'], post['pub_date'], post['pub_parsed'],
             post['link'], post['description'], hrefs_to_text(post['hrefs']))
        )
        # insert only if an identical post is not cached already
        if cursor_obj.fetchone() is None:
            cursor_obj.execute(
                '''INSERT INTO cache (feed, title, pub_date, pub_parsed, link, description, hrefs)
                   VALUES (?, ?, ?, ?, ?, ?, ?)''',
                (post['feed'], post['title'], post['pub_date'], post['pub_parsed'],
                 post['link'], post['description'], hrefs_to_text(post['hrefs']))
            )
    connection_obj.commit()

    return

def get_cached_news(cursor_obj, date):
    """
    this function fetches news from the database and returns them as a list
    """
    cursor_obj.execute('''SELECT * FROM cache WHERE pub_parsed=?''', (date, ))
    rows = cursor_obj.fetchall()

    news = []
    for row in rows:
        data = {}
        data['feed'] = row[1]
        data['title'] = row[2]
        data['pub_date'] = row[3]
        data['pub_parsed'] = row[4]
        data['link'] = row[5]
        data['description'] = row[6]

        # the "hrefs" column stores the regular-link and image-link
        # sections separated by the "--|--" sequence
        hrefs = row[7].split("--|--")
        try:
            data['hrefs'] = [tuple(item.split("-|-")) for item in hrefs[0].split("-+-") if item != '']
            data['hrefs'] += [tuple(item.split("-|-")) for item in hrefs[1].split("-+-") if item != '']
        except IndexError:
            # the post has no image section
            pass
        news.append(data)

    return news

def hrefs_to_text(link_list):
    """
    this function serializes the list of a post's links into text form
    """
    res_line = ''
    ind = -1
    for tpl in link_list:
        if tpl[1] != 'image':
            # regular links: "-+-<url>-|-<type>"
            res_line += f"-+-{tpl[0]}-|-{tpl[1]}"
        else:
            # "--|--" separates the regular-link section from the image section
            res_line += '--|--'
            ind = link_list.index(tpl)
            break

    if ind != -1:
        # image links: "<url>-|-image-|-<description>-+-"
        for tpl in link_list[ind:]:
            res_line += f"{tpl[0]}-|-{tpl[1]}-|-{tpl[2]}-+-"

    return res_line


def is_valid_date(line):
    """
    this function checks that a date parameter matches the expected YYYYMMDD format
    """
    date = r"^[1-2][0-9]{3}[0-1][0-9][0-3][0-9]$"
    return match(date, line)
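
A minimal usage sketch of this module (illustration only; the post dictionary below is hypothetical but follows the shape `rss_reader.get_post_content` builds):

```python
import cacher

connection, cursor = cacher.init_database()

# hypothetical post in the shape produced by rss_reader.get_post_content
post = {
    'feed': 'Example Feed',
    'title': 'Hello',
    'pub_date': 'Fri, 15 Nov 2019 17:55:51 -0500',
    'pub_parsed': '20191115',
    'link': 'https://example.com/hello',
    'description': 'Hello world',
    'hrefs': [('https://example.com/hello', 'link')],
}
cacher.cache_news(connection, cursor, [post])

# dates must match the YYYYMMDD format checked by is_valid_date
if cacher.is_valid_date('20191115'):
    cached = cacher.get_cached_news(cursor, '20191115')
    print(f"{len(cached)} post(s) cached for that date")
```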
201 changes: 201 additions & 0 deletions final_task/rss_reader/rss_reader.py
@@ -0,0 +1,201 @@
"""
main rss_reader module
"""

import sys
import argparse
import logging
import html
import json
import feedparser
from bs4 import BeautifulSoup
import cacher

def init_cli_parser():
    """
    this function initializes the command line parser with all necessary arguments
    """
    parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader.', prog='rss-reader')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("source", type=str, nargs='?', default=None, help="RSS URL")
    parser.add_argument('--version', help="print version info", action='version', version='%(prog)s 1.3')
    parser.add_argument("--json", help="print result as JSON in stdout", action="store_true")
    parser.add_argument("--verbose", help="output verbose status messages", action="store_true")
    group.add_argument("--date", type=str, help="print news with the provided publish date in stdout")
    parser.add_argument("--limit", type=int, help="limit news topics if this parameter is provided")

    return parser.parse_args()

def init_logger():
    """
    this function initializes a logger connected to the log file
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler("rss_reader_logs.txt")
    file_handler.setFormatter(logging.Formatter('%(asctime)s -- %(levelname)s -- %(message)s'))
    logger.addHandler(file_handler)

    return logger

def brush_text(line):
    """
    this function wraps description text into a more convenient form,
    breaking it at the first space after every ~100 characters
    """
    start = 100
    while True:
        i = start - 10
        try:
            # walk forward to the nearest space and break the line there
            while line[i] != ' ':
                i += 1
        except IndexError:
            break
        line = line[:i] + "\n" + line[i + 1:]
        start += 100

    return line

def get_post_content(post, feed_title):
    """
    this function fetches the necessary elements of a publication from a post
    """
    data = {}
    data['feed'] = feed_title
    data['title'] = html.unescape(post.title)
    data['pub_date'] = post.published
    # zero-pad month and day so the value matches the YYYYMMDD format
    # expected by cacher.is_valid_date and the --date cache lookup
    data['pub_parsed'] = (f"{post.published_parsed.tm_year}"
                          f"{post.published_parsed.tm_mon:02d}{post.published_parsed.tm_mday:02d}")
    data['link'] = post.link
    soup = BeautifulSoup(post.description, 'html.parser')
    data['description'] = brush_text(html.unescape(soup.text))
    data['hrefs'] = [(link['href'], 'link') for link in soup.find_all('a') if link.get('href', None)]
    for img in soup.find_all('img'):
        if not img.get('src', 'Unknown') == '':
            data['hrefs'] += [(img.get('src', 'Unknown'), 'image', img.get('alt', ''))]
            data['description'] = \
                f"[image {len(data['hrefs'])}: {img.get('alt', '')}] [{len(data['hrefs'])}]\n" + data['description']

    return data

def parse_news(url):
    """
    this function parses news from the given url and returns a news list
    """
    feed = feedparser.parse(url)
    if feed.bozo == 1:
        raise ValueError("not well-formed xml or broken access to the Internet")

    news = []
    for post in feed.entries:
        news += [get_post_content(post, feed.feed.title)]

    return news

def display_news(news):
    """
    this function prints news to stdout
    """
    if len(news) == 0:
        return

    is_same_feed = all(news[0]['feed'] == item['feed'] for item in news)
    if is_same_feed:
        print(f"Feed: {news[0]['feed']}\n")

    for item in news:
        if not is_same_feed:
            print(f"Feed: {item['feed']}\n")
        print(f"Title: {item['title']}")
        print(f"Publication date: {item['pub_date']}")
        print(f"Link: {item['link']}\n")
        print(f"{item['description']}\n")
        print("Links:")
        for index, tpl in enumerate(item['hrefs']):
            print(f"[{index + 1}] {tpl[0]} ({tpl[1]})")
        print('\n')

    return

Collaborator (on the emptiness check): the suggested simplification is `if not news: return None`.

def to_json(news):
    """
    this function represents news in json format
    """
    for item in news:
        # "pub_parsed" is an internal cache key, not part of the JSON output
        del item['pub_parsed']

    return json.dumps({'news': news}, indent=2)

def main():
    """
    an entry point for the program
    """
    logger = init_logger()
    args = init_cli_parser()
    connection, cursor = cacher.init_database()

    if args.verbose:
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.info("verbose notifications are turned on")

    if args.limit is not None and args.limit < 1:
        # treat zero and negative limits as invalid
        if not args.verbose:
            print("error: invalid limit value")
        logger.error("invalid limit value")
        logger.info("end of work -->|")
        return

    if args.date:
        try:
            logger.info("checking date..")
            if not cacher.is_valid_date(args.date):
                raise ValueError("invalid date")
            logger.info("started fetching data from cache..")
            news = cacher.get_cached_news(cursor, args.date)
            if len(news) == 0:
                raise IndexError("no news for this date")
            news = news[:args.limit if args.limit else len(news)]
        except ValueError:
            if not args.verbose:
                print("error: invalid date")
            logger.error("invalid date")
            logger.info("end of work -->|")
            return
        except IndexError:
            if not args.verbose:
                print("no news for this date")
            logger.info("no news for this date")
            logger.info("end of work -->|")
            return

    if args.source:
        logger.info(f"started fetching data (url - {args.source})..")
        try:
            news = parse_news(args.source)
            logger.info("started caching data..")
            cacher.cache_news(connection, cursor, news)
            news = news[:args.limit if args.limit else len(news)]
        except ValueError:
            if not args.verbose:
                print("error: not well-formed xml or broken access to the Internet")
            logger.error("not well-formed xml or broken access to the Internet")
            logger.info("end of work -->|")
            return

    if args.limit:
        logger.info(f"the limit of publications to show - {args.limit}")

    if not args.json:
        logger.info("displaying news..\n")
        display_news(news)
    else:
        logger.info("displaying news in json format..\n")
        print(to_json(news))

    logger.info(f"\npublications were successfully shown - {len(news)}")
    logger.info("end of work -->|")

    return


if __name__ == "__main__":
    main()

Collaborator comments on main():
- Perhaps this function should be split into several smaller functions.
- Why use both print and the logger to output the same text? You could get by with the logger alone.
- It makes sense to add a text message to each exception raised here.
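
For reference, typical invocations of the reader, run from the final_task/rss_reader directory (the feed URL is an example only):

```
python rss_reader.py https://news.yahoo.com/rss --limit 3
python rss_reader.py https://news.yahoo.com/rss --json --verbose
python rss_reader.py --date 20191115 --limit 5
```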
22 changes: 22 additions & 0 deletions final_task/setup.py
@@ -0,0 +1,22 @@
import setuptools

with open("README.md", "r") as f:
    long_description = f.read()

setuptools.setup(
    name="rss-reader",
    version="1.2",
    author="Anton Pashkevich",
    author_email="mario.lazer@mail.ru",
    description="Pure Python command-line RSS reader",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/prague15031939/PythonHomework",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.8',
)

Collaborator: according to the task specification, installing the package should export an rss-reader utility; after `pip install .` no such utility appears:

# rss-reader
bash: rss-reader: command not found
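
One way to address the comment above is a console-script entry point; a hedged sketch, assuming the package directory `rss_reader` contains `rss_reader.py` with a `main()` function:

```python
setuptools.setup(
    # ...same arguments as above, plus:
    entry_points={
        "console_scripts": [
            # exposes an `rss-reader` command on PATH;
            # assumes rss_reader/rss_reader.py defines main()
            "rss-reader=rss_reader.rss_reader:main",
        ],
    },
)
```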