diff --git a/README.md b/README.md
new file mode 100644
index 0000000..dc63f5a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,57 @@
+# RSS reader
+
+RSS reader is a command-line utility which receives an RSS URL and prints the results in a human-readable format.
+
+## Specification
+
+usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT]
+ source
+
+Pure Python command-line RSS reader.
+
+positional arguments:
+ source RSS URL
+
+optional arguments:
+ -h, --help show this help message and exit
+ --version Print version info
+ --json Print result as JSON in stdout
+ --verbose Outputs verbose status messages
+ --limit LIMIT Limit news topics if this parameter provided
+ --date DATE Take a date in %Y%m%d format. The news from the specified
+ day will be printed out.
+ --to-html TO_HTML Convert news into html and print in stdout. Argument
+ receives the path where new file will be saved.
+ --to-fb2 TO_FB2 Convert news into fb2 and print in stdout. Argument
+ receives the path where new file will be saved.
+
+
+
+## News caching
+RSS news items are stored in local storage while they are read. The local storage is implemented with shelve. Cached news are dicts holding the news item itself and its raw (HTML) description, stored by key. The key consists of the date and the RSS URL. Cached news can be read with the optional argument --date. The utility creates a binary db file 'cache.db' in the current directory. If you change the current directory, the db file from the previous one will not be copied to the new directory.
+
+## JSON structure
+
+{
+ "news": {
+ "feed": "TUT.BY: Новости ТУТ - Главные новости",
+ "items": [
+ {
+ "title": "Охрана, неприкосновенность, пенсия. Канопацкая предлагает закон о гарантиях для экс-президента Беларуси",
+ "link": "https://news.tut.by/economics/662957.html?utm_campaign=news-feed&utm_medium=rss&utm_source=rss-news",
+ "date": "Wed, 27 Nov 2019 15:41:00 +0300",
+ "description": {
+ "text": "Депутат Анна Канопацкая разработала законопроект «О гарантиях президенту Республики Беларусь, прекратившему исполнение своих полномочий, и членам его семьи» и в ближайшее время внесет его на рассмотрение в Палату представителей.",
+ "images": [
+ {
+ "src": "https://img.tyt.by/thumbnails/n/politika/04/4/c5109116a72e8f8029fecf5ca544c9d4.jpg",
+ "alt": "Фото: sb.by"
+ }
+ ],
+ "links": null
+ }
+ }
+ ]
+ }
+}
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..588303e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+feedparser==5.2.1
+bs4==0.0.1
+dateparser==0.7.2
+requests==2.22.0
+lxml==4.4.2
diff --git a/rss/__init__.py b/rss/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rss/cache.py b/rss/cache.py
new file mode 100644
index 0000000..fff253c
--- /dev/null
+++ b/rss/cache.py
@@ -0,0 +1,156 @@
+"""This module provides work with cached news."""
+
+import logging
+import shelve
+import datetime
+import sys
+import json
+
+import dateparser
+
+from rss.converter_to_fb2 import Fb2Converter
+from rss.converter_to_html import HTMLConverter
+
+
+class Cache:
+    """Store parsed news in a shelve file and print or convert cached news."""
+
+    def __init__(self):
+        logging.info("Cache initialization")
+        self.db_file_name = 'cache.db'
+
+    def _create_key(self, date: str, url: str) -> str:
+        """Return the shelve key: the %Y%m%d date followed by the feed URL."""
+
+        logging.info('Create key')
+        return date + url
+
+    def _convert_date(self, date: str) -> str:
+        """Convert a feed date string to %Y%m%d; raise Exception if unparsable."""
+
+        logging.info('Convert date')
+
+        converted_date = dateparser.parse(date)
+        if not converted_date:
+            logging.info("Date isn't clear. Try to parse again")
+            try:
+                # Fallback for dates such as "Wed, 27 Nov 2019 15:41:00 +0300".
+                converted_date = datetime.datetime.strptime(date, "%a, %d %b %Y %X %z")
+            except (TypeError, ValueError) as error:
+                raise Exception('Something wrong with date') from error
+        return converted_date.strftime('%Y%m%d')
+
+    def insert_news(self, news, row_description, url: str):
+        """Insert one news item and its raw (HTML) description into the cache.
+        The cache file is created if it doesn't exist; duplicates are skipped.
+        """
+
+        key = self._create_key(self._convert_date(news['date']), url)
+        logging.info("Open db or create if it doesn't exist for inserting news")
+        with shelve.open(self.db_file_name) as db:
+            record = db.get(key)
+            if record:
+                logging.info("Update record")
+                # Append to both lists together so that news and raw
+                # descriptions stay aligned index by index.
+                if news not in record['list_of_news']:
+                    record['list_of_news'].append(news)
+                    record['list_of_row_descriptions'].append(row_description)
+            else:
+                logging.info("Create new record")
+                record = {
+                    'list_of_news': [news],
+                    'list_of_row_descriptions': [row_description],
+                }
+            # The (possibly updated) record is assigned back to store it.
+            db[key] = record
+
+    def _check_entered_date(self, key: str):
+        """Raise ValueError unless the entered date is exactly eight digits."""
+
+        logging.info('Check entered date')
+        if len(key) != 8 or not key.isdigit():
+            raise ValueError('Invalid entered date')
+
+    def _get_news(self, key: str) -> dict:
+        """Return the cached record stored under key; raise Exception if absent."""
+
+        logging.info("Open db or create if it doesn't exist for getting news")
+        with shelve.open(self.db_file_name) as db:
+            try:
+                return db[key]
+            except KeyError:
+                # The KeyError only repeats the key, so it is not chained.
+                raise Exception("Can't find the news") from None
+
+    def set_printing_news(self, url: str, date: str,
+                          limit: int, json_mode: bool,
+                          fb2_path: str, html_path: str):
+        """Print cached news for date and url, and convert them if paths are given."""
+
+        logging.info("Set print format")
+
+        self._check_entered_date(date)
+        self._check_limit(limit)
+
+        key = self._create_key(date, url)
+        record = self._get_news(key)
+        news = record['list_of_news'][:limit]
+        if json_mode:
+            print(json.dumps(news, indent=4, ensure_ascii=False))
+        else:
+            self.print_news(news, limit)
+
+        if fb2_path:
+            conv = Fb2Converter(fb2_path)
+            conv.convert_to_fb2(news)
+            conv.save_fb2()
+        if html_path:
+            conv = HTMLConverter(html_path)
+            conv.save_html(conv.convert_to_html(news,
+                                                record['list_of_row_descriptions'][:limit]))
+
+    def _check_limit(self, limit):
+        """Raise ValueError if limit is given and is not positive."""
+
+        logging.info('Check limit')
+        if limit is not None and limit <= 0:
+            raise ValueError('Invalid limit: limit <= 0')
+
+    def print_news(self, list_of_news, limit):
+        """Print cached news in human-readable format.
+
+        list_of_news is a list of news dicts (a single dict is accepted too);
+        at most limit items are printed, numbered from 1.
+        """
+
+        logging.info('Start printing cached news')
+        # A single news dict passed instead of a list is still tolerated.
+        if isinstance(list_of_news, dict):
+            list_of_news = [list_of_news]
+        for news_number, news in enumerate(list_of_news[:limit], start=1):
+            print('№', news_number)
+            self._print_entries(news)
+
+    def _print_entries(self, news: dict):
+        """Print title, date, link, text, image sources and links of one news."""
+
+        logging.info('Print one news')
+        print('Title:', news['title'])
+        print('Date:', news['date'])
+        print('Link:', news['link'], '\n')
+
+        description = news['description']
+        if description['text'] != 'Nothing':
+            print(description['text'], '\n')
+        if description['images']:
+            print('Images:')
+            for item in description['images']:
+                print(item['src'])
+
+        if description['links']:
+            print('Links:')
+            for item in description['links']:
+                print(item)
+
+        print('-' * 50)
diff --git a/rss/converter_to_fb2.py b/rss/converter_to_fb2.py
new file mode 100644
index 0000000..23c32fd
--- /dev/null
+++ b/rss/converter_to_fb2.py
@@ -0,0 +1,124 @@
+"""This module converts news to fb2 format and saves."""
+
+import os
+import logging
+from base64 import b64encode
+import xml.etree.ElementTree as tree
+from xml.etree.ElementTree import Element
+import xml.dom.minidom as minidom
+
+import requests
+
+
+class Fb2Converter:
+    """Build an fb2 (FictionBook) document from news and save it to a file."""
+
+    def __init__(self, path='rss-news.fb2'):
+        logging.info('Fb2Converter initialization')
+        self.path = path
+        self.root = tree.Element('FictionBook')
+        self.root.set('xmlns:l', "http://www.w3.org/1999/xlink")
+        self.description = tree.SubElement(self.root, 'description')
+        self.body = tree.SubElement(self.root, 'body')
+
+    def insert_file_description(self):
+        """Insert the title-info element with the book title."""
+
+        logging.info('Insert description')
+        title_info = tree.SubElement(self.description, 'title-info')
+        tree.SubElement(title_info, 'book-title').text = 'RSS news'
+
+    def insert_body(self, list_of_news, limit):
+        """Insert one section per news item, at most limit sections."""
+
+        logging.info("Insert body")
+        for news in list_of_news[:limit]:
+            self.insert_section(news)
+
+    def insert_section(self, news):
+        """Insert a section describing a single news item."""
+
+        logging.info('Insert describing single news section')
+        section = tree.SubElement(self.body, 'section')
+        description = news['description']
+        self.insert_tag_p(section, news['title'], True)
+        self.insert_tag_empty_line(section)
+        self.insert_tag_p(section, 'Link: ' + news['link'])
+        self.insert_tag_p(section, 'Date: ' + news['date'])
+        self.insert_tag_empty_line(section)
+
+        # 'images' may be empty or None; a failed download must only skip
+        # that one image, not the remaining ones.
+        for img in description['images'] or []:
+            try:
+                self.insert_image(section, img['src'], img.get('alt', ''))
+            except Exception as e:
+                print("Errors with images: ", e)
+
+        self.insert_tag_empty_line(section)
+        self.insert_tag_p(section, description['text'])
+
+        if description['links']:
+            self.insert_tag_empty_line(section)
+            self.insert_tag_p(section, 'Links:')
+            for link in description['links']:
+                self.insert_tag_p(section, link)
+        self.insert_tag_empty_line(section)
+        self.insert_tag_p(section, '-'*50)
+
+    def insert_tag_empty_line(self, parent):
+        """Insert an empty-line element into parent."""
+
+        logging.info('Insert empty line')
+        tree.SubElement(parent, 'empty-line')
+
+    def insert_tag_p(self, parent, text, strong_mode=None):
+        """
+        Insert tag p with text.
+        If strong_mode then text will be bold.
+        """
+
+        if strong_mode:
+            logging.info('Insert tag p with strong text')
+            tag_p = tree.SubElement(parent, 'p')
+            tree.SubElement(tag_p, 'strong').text = text
+        else:
+            logging.info('Insert tag p')
+            tree.SubElement(parent, 'p').text = text
+
+    def convert_to_fb2(self, news, limit=None):
+        """Fill the document with the description and at most limit news."""
+
+        logging.info('Start conversion to fb2')
+        self.insert_file_description()
+        self.insert_body(news, limit)
+
+    def save_fb2(self):
+        """Save the pretty-printed fb2 document to self.path as UTF-8."""
+
+        logging.info('Save fb2 converted news')
+        # Pretty-print in memory instead of writing the file, re-reading it
+        # and writing it again; the explicit encoding keeps non-ASCII news
+        # intact whatever the locale's default encoding is.
+        document = minidom.parseString(tree.tostring(self.root, encoding='utf-8'))
+
+        with open(self.path, 'wb') as file:
+            file.write(document.toprettyxml(encoding='utf-8'))
+
+    def insert_image(self, parent, img_url, img_name):
+        """Embed the image from img_url; img_name is kept for compatibility."""
+
+        logging.info('Insert image')
+        data = self.get_binary_img(img_url)  # fetch first: no dangling tag on failure
+        image_id = 'image_{}'.format(len(self.root.findall('binary')))  # unique, no spaces
+        tree.SubElement(parent, 'image').set('l:href', '#' + image_id)
+        binary = tree.SubElement(self.root, 'binary',
+                                 {'id': image_id, 'content-type': 'image/png'})
+        binary.text = data
+
+    def get_binary_img(self, src):
+        """Download src and return its content as a base64 string."""
+        logging.info('Get binary img')
+        resource = requests.get(src, timeout=10)
+        resource.raise_for_status()  # do not embed an HTTP error page as an image
+        return b64encode(resource.content).decode('UTF-8')
diff --git a/rss/converter_to_html.py b/rss/converter_to_html.py
new file mode 100644
index 0000000..1814531
--- /dev/null
+++ b/rss/converter_to_html.py
@@ -0,0 +1,52 @@
+"""This module converts news to HTML and saves it."""
+
+import os
+import logging
+
+from bs4 import BeautifulSoup
+from lxml import html
+from lxml import etree
+from lxml.builder import E
+
+
+class HTMLConverter:
+    """Convert news to an HTML page and save it to a file."""
+
+    def __init__(self, path='rss-news.html'):
+        logging.info('HTMLConverter initialization')
+        self.path = path
+
+    def convert_to_html(self, list_of_news, list_of_row_descriptions):
+        """Return an lxml html element holding the news and raw descriptions."""
+
+        logging.info('Start conversion to HTML')
+        # Visible content belongs in body, not directly under the html root.
+        body = E.body()
+        page = E.html(E.head(E.meta(charset='utf-8'), E.title("RSS news")), body)
+
+        for single_news, single_description in \
+                zip(list_of_news, list_of_row_descriptions):
+            logging.info('Convert one news')
+            body.append(E.P(
+                E.center(E.h2(single_news['title'])),
+                E.h2(E.a(single_news['link'], href=single_news['link'])),
+                E.h4(single_news['date']),
+            ))
+            # html.fromstring raises on an empty document, so skip blank ones.
+            if single_description and single_description.strip():
+                body.append(html.fromstring(single_description))
+            body.append(E.BR())
+            body.append(E.BR())
+            body.append(E.HR())
+        return page
+
+    def save_html(self, html_news):
+        """Save the HTML element html_news to self.path as UTF-8."""
+
+        logging.info('Save HTML converted news')
+        # An explicit encoding keeps non-ASCII news independent of the locale.
+        with open(self.path, 'w', encoding='utf-8') as file:
+            file.write(html.tostring(html_news,
+                                     pretty_print=True,
+                                     encoding='unicode',
+                                     method='html', doctype=''))
diff --git a/rss/news.py b/rss/news.py
new file mode 100644
index 0000000..2bd1361
--- /dev/null
+++ b/rss/news.py
@@ -0,0 +1,174 @@
+"""Module contains class related to news."""
+
+import json
+import logging
+import sys
+import datetime
+
+import feedparser
+from bs4 import BeautifulSoup
+
+from rss.cache import Cache
+
+
+class News:
+    """This class parses, processes and outputs news."""
+
+    def __init__(self, url: str, limit=None):
+        logging.info('News initialization')
+        self._check_limit(limit)  # validate before any network access
+
+        self.url = url
+        logging.info('Parsing url')
+        self.feeds = feedparser.parse(self.url)
+        self._check_url()
+
+        self.feed_title = self.feeds.feed.get('title')
+        self.list_of_news = []
+        self.list_of_row_descriptions = []
+
+        self.make_list_of_news()
+
+    def _check_url(self):
+        """Raise Exception unless the feed parsed cleanly with HTTP status 200."""
+
+        logging.info('Check URL')
+        if self.feeds['bozo'] or self.feeds.get('status') != 200:
+            raise Exception('Something wrong with URL or Internet connection')
+
+    def _check_limit(self, limit):
+        """Raise ValueError if limit is given and is not positive."""
+
+        logging.info('Check limit')
+        if limit is not None and limit <= 0:
+            raise ValueError('Invalid limit: limit <= 0')
+
+    def print_news(self, limit):
+        """Print the feed title and at most limit news in human-readable format."""
+
+        logging.info("Start printing news")
+        print('\nFeed:', self.feed_title, "\n\n")
+
+        # self.list_of_news is built as a list; a single dict is tolerated too.
+        news_to_print = self.list_of_news
+        if isinstance(news_to_print, dict):
+            news_to_print = [news_to_print]
+        else:
+            news_to_print = news_to_print[:limit]
+
+        for news_number, news in enumerate(news_to_print, start=1):
+            print('№', news_number)
+            self._print_entries(news)
+
+    def _print_entries(self, news: dict):
+        """Print title, date, link, text, image sources and links of one news."""
+
+        logging.info('Print one news')
+        print('Title:', news['title'])
+        print('Date:', news['date'])
+        print('Link:', news['link'], '\n')
+
+        description = news['description']
+        if description['text'] != 'Nothing':
+            print(description['text'], '\n')
+        if description['images']:
+            print('Images:')
+            for item in description['images']:
+                print(item['src'])
+
+        if description['links']:
+            print('Links:')
+            for item in description['links']:
+                print(item)
+
+        print('-' * 50)
+
+    def _find_date_tag(self, news: dict) -> str:
+        """
+        Find the date of one feed entry.
+
+        The keys 'published', 'pubDate' and 'Date' are tried in that order;
+        the current local date in ISO format is returned if none has a value.
+        """
+
+        logging.info('Find date tag')
+
+        # The same key is used for the lookup and for the returned value,
+        # so a found date can never end in a KeyError.
+        for key in ('published', 'pubDate', 'Date'):
+            if news.get(key):
+                return news[key]
+
+        return datetime.datetime.now().isoformat()
+
+    def make_list_of_news(self):
+        """Build self.list_of_news from the parsed entries and cache every item.
+
+        Each news item is a dict with 'title', 'link', 'date' and 'description'.
+        """
+        from html import unescape  # local import: the module does not import html
+
+        logging.info('Make a list of news')
+        cache = Cache()
+        for news in self.feeds['entries']:
+            # Decode character references left in the title (e.g. an escaped apostrophe).
+            one_news = {'title': unescape(news.get('title', 'Unknown')),
+                        'link': news.get('link', 'Unknown'),
+                        'date': self._find_date_tag(news)}
+            one_news.update(self._read_description(news))
+
+            self.list_of_news.append(one_news)
+            cache.insert_news(one_news, self.list_of_row_descriptions[-1], self.url)
+
+    def _read_description(self, news: dict) -> dict:
+        """Return {'description': {...}} with keys 'text', 'images', 'links'.
+
+        'text' is the plain description text ('Nothing' when empty),
+        'images' is a list of {'src', 'alt'} dicts or None,
+        'links' is a list of urls or None.
+        The raw description is appended to self.list_of_row_descriptions.
+        """
+        from html import unescape  # local import: the module does not import html
+
+        logging.info('Get information from description')
+        raw_description = news.get('description', '')  # an entry may lack a description
+        soup = BeautifulSoup(raw_description, features="html.parser")
+
+        logging.info('Get text of description')
+        text = unescape(soup.text) or 'Nothing'
+        self.list_of_row_descriptions.append(raw_description)
+        return {'description': {'text': text, 'images': self._get_img_list(soup),
+                                'links': self._get_links_list(soup)}}
+
+    def _get_img_list(self, soup) -> list:
+        """Get images src and alt from soup object.
+        Return a list of {'src', 'alt'} dicts, or None if there are no images.
+        """
+
+        logging.info('Get images')
+        # alt is optional, so a missing one becomes an empty string
+        # instead of raising KeyError.
+        list_of_images = [{'src': image['src'], 'alt': image.get('alt', '')}
+                          for image in soup.find_all('img') if image.get('src')]
+
+        return list_of_images or None
+
+    def _get_links_list(self, soup):
+        """Return the unique href/url values of soup in document order, or None."""
+
+        logging.info('Get set of links')
+        links = []
+        for tag in soup.find_all():
+            for attribute in ('href', 'url'):
+                link = tag.get(attribute)
+                if link and link not in links:
+                    links.append(link)
+        return links or None
+
+    def convert_to_json(self, limit=None):
+        """Return the feed title and at most limit news as a JSON string."""
+
+        logging.info('Convert news into JSON format')
+        items = self.list_of_news[:limit]
+        return json.dumps({'news': {'feed': self.feed_title, 'items': items}},
+                          indent=4, ensure_ascii=False)
diff --git a/rss/rss_reader.py b/rss/rss_reader.py
new file mode 100644
index 0000000..56349a4
--- /dev/null
+++ b/rss/rss_reader.py
@@ -0,0 +1,118 @@
+"""Module provides work with command line."""
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+from rss.news import News
+from rss.cache import Cache
+from rss.converter_to_fb2 import Fb2Converter
+from rss.converter_to_html import HTMLConverter
+
+VERSION = "4.0"
+
+
+def add_args(parser):
+    """Add the command-line arguments of the reader to parser and return it."""
+
+    logging.info('Add arguments')
+    parser.add_argument('source', help='RSS URL', type=str)
+    parser.add_argument('--version', help='Print version info', action='version')
+    parser.add_argument('--json', help='Print result as JSON in stdout', action="store_true")
+    parser.add_argument('--verbose', help='Outputs verbose status messages', action="store_true")
+    parser.add_argument('--limit', help='Limit news topics if this parameter provided', type=int)
+    parser.add_argument('--date', help="""Take a date in %%Y%%m%%d format.
+                        The news from the specified day will be printed out.""", type=str)
+    parser.add_argument('--to-html', help="""Convert news into html.
+                        Argument receives the path where new file will be saved.""", type=str)
+    parser.add_argument('--to-fb2', help="""Convert news into fb2.
+                        Argument receives the path where new file will be saved.""", type=str)
+    return parser
+
+
+def check_path(input_path: str):
+    """Print a message if input_path cannot be turned into a Path object."""
+
+    logging.info('Check path')
+    try:
+        Path(input_path)
+    except Exception as e:
+        print("Invalid path: ", e)
+
+def start_parsing(url: str, limit: int, json_mode: bool,
+                  fb2_path: str, html_path: str):
+    """Parse the feed, print its news and optionally convert them.
+
+    :param url: RSS URL
+    :param limit: news amount that will be printed
+    :param json_mode: if true then news will be printed in JSON format
+    :param fb2_path: if set, path where the fb2 file is saved
+    :param html_path: if set, path where the HTML file is saved
+    """
+    logging.info('Create feed')
+    news = News(url, limit)
+    if not json_mode:
+        news.print_news(limit)
+    else:
+        print(news.convert_to_json(limit))
+    shown = news.list_of_news[:limit]
+    if fb2_path:
+        fb2_converter = Fb2Converter(fb2_path)
+        fb2_converter.convert_to_fb2(shown)
+        fb2_converter.save_fb2()
+    if html_path:
+        conv = HTMLConverter(html_path)
+        conv.save_html(conv.convert_to_html(shown, news.list_of_row_descriptions[:limit]))
+
+
+def set_verbose_mode(verbose_mode: bool):
+    """Drop existing root handlers; log INFO messages if verbose_mode is set."""
+    for old_handler in list(logging.root.handlers):
+        logging.root.removeHandler(old_handler)
+    if not verbose_mode:
+        return
+    logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')
+    logging.info('Set verbose mode')
+
+
+def main():
+    """Parse the command-line arguments and print cached or freshly parsed news."""
+
+    parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader')
+    parser = add_args(parser)
+    # The 'version' action registered in add_args prints this value.
+    parser.version = VERSION
+    args = parser.parse_args()
+
+    set_verbose_mode(args.verbose)
+
+    # Only the output paths that were actually given are checked.
+    for output_path in (args.to_fb2, args.to_html):
+        if output_path:
+            check_path(output_path)
+
+    if args.date:
+        # With --date the news are taken from the cache, not parsed from the URL.
+        try:
+            Cache().set_printing_news(args.source, args.date, args.limit,
+                                      args.json, args.to_fb2, args.to_html)
+        except Exception as error:
+            print('Errors with cache:', error)
+    else:
+        try:
+            start_parsing(args.source, args.limit, args.json,
+                          args.to_fb2, args.to_html)
+        except Exception as error:
+            print('Errors with parsing:', error)
+
+    logging.info('Program is completed')
+
+
+def run():
+    """Console-script entry point: run main and report any escaped error."""
+
+    try:
+        main()
+    except Exception as error:
+        print('There are some errors: ', error)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..fdc70c4
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,20 @@
+from setuptools import setup, find_packages
+
+# README.md contains non-ASCII text, so it is read as UTF-8 explicitly.
+with open('README.md', 'r', encoding='utf-8') as f:
+    long_description = f.read()
+
+setup(
+    name='rss-reader',
+    version='4.0',
+    description='Pure Python command-line RSS reader',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    url='https://github.com/yanaShcherbich/PythonHomework',
+    author='Yana Shcherbich',
+    author_email='vilikdf@gmail.com',
+    packages=find_packages(),
+    python_requires='>=3.6',
+    install_requires=['feedparser', 'bs4', 'dateparser', 'requests', 'lxml'],
+    entry_points={'console_scripts': ['rss-reader=rss.rss_reader:run']},
+)