diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7ea5c38
--- /dev/null
+++ b/README.md
@@ -0,0 +1,30 @@
+# Pure Python RSS Reader [PythonHomework]
+
+Python version: v3.8
+
+Current version: v0.5
+
+Code checking: the code conforms to PEP 8
+#### Usage:
+```shell
+usage: __main__.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [--date DATE] [--to-pdf] [--to-html] [--colorize] source
+
+Pure Python command-line RSS reader.
+
+positional arguments:
+  source         RSS URL
+
+optional arguments:
+  -h, --help     show this help message and exit
+  --version      Print version info
+  --json         Print result as JSON in stdout
+  --verbose      Outputs verbose status messages
+  --limit LIMIT  Limit news topics if this parameter provided
+  --date DATE    Show cached news by input date
+  --to-pdf       Convert news to pdf format
+  --to-html      Convert news to html format
+  --colorize     Colorize output text
+ ```
+ The JSON schema of a news item is described in `json_schema.json`.
+
+ News is cached in the JSON file `cached_news.json` in the root application directory.
diff --git a/json_schema.json b/json_schema.json
new file mode 100644
index 0000000..35692b1
--- /dev/null
+++ b/json_schema.json
@@ -0,0 +1,31 @@
+{
+ "$schema": "http://json-schema.org/schema#",
+ "title": "feed",
+ "type": "object",
+ "required": ["title", "date", "text", "link", "hrefs"],
+ "properties": {
+ "title": {
+ "type": "string",
+ "description": "News title"
+ },
+ "date": {
+ "type": "string",
+ "description": "News published date"
+ },
+ "text": {
+ "type": "string",
+ "description": "News text"
+ },
+ "link": {
+ "type": "string",
+ "description": "News static link"
+ },
+ "hrefs": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "description": "News hrefs"
+ }
+ }
+ }
+ }
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..be22e7f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+feedparser==5.2.1
+requests==2.22.0
+tldextract==2.2.2
+fpdf==1.7.2
+colorama==0.4.1
+lxml==4.4.2
\ No newline at end of file
diff --git a/rss_reader/FreeSans.ttf b/rss_reader/FreeSans.ttf
new file mode 100644
index 0000000..9db9585
Binary files /dev/null and b/rss_reader/FreeSans.ttf differ
diff --git a/rss_reader/__init__.py b/rss_reader/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rss_reader/__main__.py b/rss_reader/__main__.py
new file mode 100644
index 0000000..c9fc23b
--- /dev/null
+++ b/rss_reader/__main__.py
@@ -0,0 +1,69 @@
+"""Main module"""
+
+import argparse
+import logging
+
+from colorama import init
+
+from .rss_reader import NewsReader
+
+# Reported by --version; keep in sync with the version in setup.py.
+VERSION = '0.5'
+
+
+def main():
+    """Parse the arguments, configure logging, then fetch and print the news."""
+    init()  # colorama init
+    args = parse_args()
+
+    # WARNING is the logging default, so without --verbose the INFO progress
+    # messages stay hidden exactly as with a bare basicConfig() call.
+    logging.basicConfig(format="%(levelname)s: %(message)s",
+                        level=logging.INFO if args.verbose else logging.WARNING)
+
+    reader = NewsReader(args.source, args.limit, args.json, args.date, args.to_pdf, args.to_html, args.colorize)
+    reader.parse_url()
+
+    reader.print_news()
+
+
+def _positive_int(value):
+    """Convert a command-line value to an integer greater than zero.
+
+    :raises argparse.ArgumentTypeError: if the value is not a positive integer
+    """
+    try:
+        number = int(value)
+    except ValueError:
+        raise argparse.ArgumentTypeError(f'{value!r} is not an integer') from None
+
+    if number <= 0:
+        raise argparse.ArgumentTypeError(f'{value!r} is not a positive integer')
+    return number
+
+
+def parse_args(argv=None):
+    """Parse the command-line arguments.
+
+    :param argv: argument list to parse; None makes argparse use sys.argv[1:]
+    :return: argparse.Namespace with the parsed options
+    """
+    parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader.')
+
+    parser.add_argument('source', help='RSS URL')
+    parser.add_argument('--version', help='Print version info', action='version', version=f'%(prog)s {VERSION}')
+    parser.add_argument('--json', help='Print result as JSON in stdout', action='store_true')
+    parser.add_argument('--verbose', help='Outputs verbose status messages', action='store_true')
+    # The limit is used as a slice bound: a negative value would cut news off
+    # the end of the list and zero would show nothing, so both are rejected.
+    parser.add_argument('--limit', help='Limit news topics if this parameter provided', type=_positive_int)
+    parser.add_argument('--date', help='Show cached news by input date', type=str)
+    parser.add_argument('--to-pdf', help='Convert news to pdf format', action='store_true')
+    parser.add_argument('--to-html', help='Convert news to html format', action='store_true')
+    parser.add_argument('--colorize', help='Colorize output text', action='store_true')
+
+    return parser.parse_args(argv)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/rss_reader/date_validation.py b/rss_reader/date_validation.py
new file mode 100644
index 0000000..951e0a0
--- /dev/null
+++ b/rss_reader/date_validation.py
@@ -0,0 +1,19 @@
+"""Date validation helpers"""
+
+from datetime import datetime
+
+
+def is_valid_date(date, pattern):
+    """Check whether a date string matches a strptime pattern.
+
+    :param date: text to check, e.g. '20191201'
+    :param pattern: datetime.strptime format, e.g. '%Y%m%d'
+    :return: True if date can be parsed with pattern, otherwise False
+        (also when date or pattern is not a string)
+    """
+    try:
+        datetime.strptime(date, pattern)
+    except (ValueError, TypeError):
+        # ValueError: the text does not match; TypeError: not a string at all
+        return False
+    return True
diff --git a/rss_reader/html_converter.py b/rss_reader/html_converter.py
new file mode 100644
index 0000000..14ca24d
--- /dev/null
+++ b/rss_reader/html_converter.py
@@ -0,0 +1,46 @@
+"""HTML converter module"""
+
+import logging
+
+from html import escape
+
+
+class HTMLConverter:
+    """Render news items into a standalone HTML file."""
+
+    def __init__(self, data, file_name):
+        """Remember what to render and where to write it.
+
+        :param data: iterable of news dicts with the keys 'title', 'date',
+            'link', 'text' and 'hrefs' (a list of URLs rendered as images)
+        :param file_name: output path without the '.html' extension; it is
+            also used as the page title
+        """
+        self.data = data
+        self.file_name = file_name
+
+    def dump(self):
+        """Create and fill HTML-file"""
+        logging.info("Create and fill HTML-file")
+
+        # Feed content is untrusted: everything interpolated into the markup
+        # is escaped (escape() also quotes '"', which protects attributes).
+        parts = [
+            '<!DOCTYPE html>\n<html>\n<head>\n<meta charset="utf-8">\n'
+            f'<title>{escape(self.file_name)}</title>\n</head>\n<body>\n'
+        ]
+
+        for element in self.data:
+            link = escape(element["link"])
+            parts.append(f'<h2>{escape(element["title"])}</h2>\n')
+            parts.append(f'<p><i>{escape(element["date"])}</i></p>\n')
+            parts.append(f'<p><a href="{link}">{link}</a></p>\n')
+            parts.append(f'<p>{escape(element["text"])}</p>\n')
+
+            for href in element["hrefs"]:
+                parts.append(f'<p><img src="{escape(href)}" alt=""></p>\n')
+
+        parts.append('</body>\n</html>\n')
+
+        with open(f'{self.file_name}.html', 'w', encoding='utf-8') as html_file:
+            html_file.write(''.join(parts))
diff --git a/rss_reader/json_formatter.py b/rss_reader/json_formatter.py
new file mode 100644
index 0000000..f3df148
--- /dev/null
+++ b/rss_reader/json_formatter.py
@@ -0,0 +1,39 @@
+"""Module with tools for working with Json"""
+
+import json
+import logging
+
+
+class NewsJsonFormatter:
+    """Keep news as a {'feed': [...]} dict and serialize it to JSON."""
+
+    def __init__(self):
+        # Empty until format() is called.
+        self.data = {}
+
+    def __str__(self):
+        """Compute json-file for print news to console in json-format"""
+        logging.info('Compute json-file for print news to console in json-format')
+
+        return json.dumps(self.data, ensure_ascii=False, indent=4)
+
+    def write_to_file(self, file_name='data.json'):
+        """Write json-data to file
+
+        :param file_name: path of the file to create or overwrite
+        """
+        logging.info('Write json-data to file')
+
+        with open(file_name, 'w', encoding='utf-8') as outfile:
+            json.dump(self.data, outfile, ensure_ascii=False, indent=4)
+
+    def format(self, news):
+        """Format data to json appearance
+
+        :param news: iterable of JSON-serializable news items
+        """
+        logging.info('Format data to json appereance')
+
+        # list() accepts any iterable and copies it, so the formatter never
+        # shares its list with the caller.
+        self.data = {'feed': list(news)}
diff --git a/rss_reader/news_cacher.py b/rss_reader/news_cacher.py
new file mode 100644
index 0000000..18e29b6
--- /dev/null
+++ b/rss_reader/news_cacher.py
@@ -0,0 +1,83 @@
+"""News cacher module"""
+
+import logging
+import json
+
+from datetime import datetime
+from os.path import abspath, getsize, exists
+
+from .json_formatter import NewsJsonFormatter
+
+
+class NewsCacher:
+    """Keep news in a JSON file grouped by source and by caching day.
+
+    File layout: {source: {'YYYYMMDD': [news item, ...]}}.
+    """
+
+    def __init__(self, file_name, source):
+        """Remember the cache location and the group to work with.
+
+        :param file_name: path of the JSON cache file
+        :param source: top-level key the news are stored under
+        """
+        self.file_name = file_name
+        self.source = source
+        # Today's key; cache() stores news under it.
+        self.date = datetime.now().strftime('%Y%m%d')
+        self.data = None
+
+    def get_cached_news(self, date, limit):
+        """Get cached news from json file
+
+        :param date: day key in YYYYMMDD form
+        :param limit: maximum number of items to return, None for all
+        :return: list of cached news items
+        :raises FileNotFoundError: if the cache file does not exist
+        :raises ValueError: if the file is not valid JSON or holds no news
+            for this source and date
+        """
+        logging.info("Get cached news from json file")
+
+        with open(self.file_name, 'r', encoding='utf-8') as json_file:
+            self.data = json.load(json_file)
+
+        try:
+            return self.data[self.source][date][:limit]
+        except (KeyError, TypeError) as error:
+            raise ValueError(f'No cached news for {self.source} on {date}') from error
+
+    def cache(self, json_object):
+        """Cache news to json file
+
+        News already stored for this source and day are not duplicated.
+
+        :param json_object: iterable of JSON-serializable news items
+        """
+        logging.info("Cache news to json file")
+
+        self.data = self._load()
+        cached = self.data.setdefault(self.source, {}).setdefault(self.date, [])
+
+        for element in json_object:
+            if element not in cached:
+                cached.append(element)
+
+        with open(self.file_name, 'w', encoding='utf-8') as outfile:
+            json.dump(self.data, outfile, ensure_ascii=False, indent=4)
+
+    def _load(self):
+        """Return the cache file content, or {} if it is missing, empty or unusable."""
+        path = abspath(self.file_name)
+        if not exists(path) or getsize(path) == 0:
+            return {}
+
+        try:
+            with open(path, 'r', encoding='utf-8') as json_file:
+                data = json.load(json_file)
+        except ValueError:
+            # json.JSONDecodeError is a ValueError: the cache is corrupted
+            logging.warning('Cache file is corrupted and will be rewritten')
+            return {}
+
+        return data if isinstance(data, dict) else {}
diff --git a/rss_reader/pdf_converter.py b/rss_reader/pdf_converter.py
new file mode 100644
index 0000000..e6d2dc4
--- /dev/null
+++ b/rss_reader/pdf_converter.py
@@ -0,0 +1,86 @@
+"""PDF converter module"""
+
+import logging
+import os
+import requests
+
+from fpdf import SYSTEM_TTFONTS, FPDF
+
+# NOTE(review): this only rebinds the name imported into this module; fpdf's
+# own SYSTEM_TTFONTS setting is not affected - confirm the intent.
+SYSTEM_TTFONTS = ''
+
+# The font ships next to this module; an absolute path keeps the converter
+# working no matter what the current working directory is.
+FONT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'FreeSans.ttf')
+
+
+class PDFConverter:
+    """Render news items into a PDF file."""
+
+    def __init__(self, data, file_name):
+        """Remember what to render and create an empty A4 portrait document.
+
+        :param data: iterable of news dicts with the keys 'title', 'date', 'link', 'text' and 'hrefs'
+        :param file_name: output path without the '.pdf' extension
+        """
+        self.data = data
+        self.file_name = file_name
+        self.pdf = FPDF(orientation='P', unit='mm', format='A4')
+
+    def dump(self):
+        """Create and fill PDF-file"""
+        logging.info("Create and fill PDF-file")
+
+        margin = 5
+        page_height = 300
+        image_height = 50
+
+        self.pdf.add_page()
+        self.pdf.set_auto_page_break(True, 10 * margin)
+        # The font only has to be registered once per document.
+        self.pdf.add_font('FreeSans', '', FONT_PATH, uni=True)
+
+        for element in self.data:
+            self.pdf.set_fill_color(62, 147, 96)
+
+            self.pdf.set_text_color(255, 255, 255)
+            self.pdf.set_font("FreeSans", size=14)
+            self.pdf.multi_cell(0, 8, txt=element["title"], align="C", fill=1)
+            self.pdf.set_fill_color(90, 167, 120)
+            self.pdf.set_font("FreeSans", size=10)
+            self.pdf.multi_cell(0, 5, txt=element["date"], align="R", fill=1)
+            self.pdf.set_font("FreeSans", size=10)
+            self.pdf.multi_cell(0, 5, txt=element["link"], align="R", fill=1)
+
+            self.pdf.set_fill_color(229, 229, 229)
+            self.pdf.set_text_color(0, 0, 0)
+            self.pdf.set_font("FreeSans", size=12)
+            self.pdf.multi_cell(0, 6, txt=element["text"], align="J", fill=1)
+
+            self.pdf.set_fill_color(242, 242, 242)
+            self.pdf.set_text_color(0, 0, 0)
+            self.pdf.set_font("FreeSans", size=10)
+
+            self.pdf.ln(margin)
+
+            for href in element["hrefs"]:
+                # Only PNG images are embedded; every other link is skipped
+                # before it can trigger a needless page break.
+                if not href.lower().endswith('.png'):
+                    continue
+
+                try:
+                    if page_height - self.pdf.get_y() < image_height + margin:
+                        self.pdf.add_page()
+
+                    self.pdf.image(href, x=self.pdf.get_x() + image_height + margin, y=self.pdf.get_y(),
+                                   h=image_height)
+                    self.pdf.ln(image_height + margin)
+                except Exception:
+                    # Best effort: a broken image must not abort the export.
+                    logging.error('Cant get an image from url')
+
+            self.pdf.multi_cell(0, margin, txt="", align="J")
+
+        self.pdf.output(f'{self.file_name}.pdf')
diff --git a/rss_reader/rss_reader.py b/rss_reader/rss_reader.py
new file mode 100644
index 0000000..e0eb8aa
--- /dev/null
+++ b/rss_reader/rss_reader.py
@@ -0,0 +1,203 @@
+"""RSS-reader module"""
+
+import re
+import sys
+import logging
+import feedparser
+import lxml.html
+import lxml.html.clean
+
+from tldextract import extract
+from colorama import Fore, Back, Style
+
+from .news_cacher import NewsCacher
+from .json_formatter import NewsJsonFormatter
+from .pdf_converter import PDFConverter
+from .html_converter import HTMLConverter
+from .date_validation import is_valid_date
+
+
+class NewsReader:
+    """Fetch an RSS feed, cache its news and print or export them."""
+
+    # Entry keys that may hold the publication date; when several are present
+    # the one listed last is used.
+    DATE_KEYS = ['published', 'updated', 'pubDate']
+    # Entry keys that may hold a list of media dicts with a 'url' item.
+    MEDIA_KEYS = ['media_thumbnail', 'media_content']
+    # Matches one HTML tag; compiled once instead of on every call.
+    TAG_PATTERN = re.compile(r'<.*?>')
+
+    def __init__(self, link, limit, json, date, convert_to_pdf, convert_to_html, colorize):
+        """Store the run options and prepare the cache for the feed's site.
+
+        :param link: RSS feed URL
+        :param limit: maximum number of news to handle, None for all
+        :param json: True to print the news as JSON
+        :param date: YYYYMMDD day to read from the cache, None to use the feed
+        :param convert_to_pdf: True to export the news to a PDF file
+        :param convert_to_html: True to export the news to an HTML file
+        :param colorize: True to colorize the console output
+        """
+        self.link = link
+        self.limit = limit
+        self.json = json
+        self.hrefs = []
+        self.news = []
+        self.date = date
+        self.feed = None
+        self.json_object = NewsJsonFormatter()
+        self.convert_to_pdf = convert_to_pdf
+        self.convert_to_html = convert_to_html
+        self.colorize = colorize
+
+        # News are cached per site: the key is '<domain>.<suffix>' of the link.
+        ext_site_name = extract(self.link)
+        site_name = f'{ext_site_name.domain}.{ext_site_name.suffix}'
+
+        self.cacher_object = NewsCacher('cached_news.json', site_name)
+
+    def parse_url(self):
+        """Get RSS xml-file from url"""
+        logging.info('Get RSS XML-file from url')
+
+        self.feed = feedparser.parse(self.link)
+
+        # feedparser reports download/parsing problems through the bozo flag
+        # instead of raising. With --date the news come from the cache, so a
+        # failed download is only fatal when live news were requested.
+        if self.date is None and self.feed.get('bozo') and not self.feed.entries:
+            logging.error('Unable to parse url')
+            sys.exit(1)
+
+        try:
+            self.parse_xml(self.feed.entries[:self.limit])
+        except lxml.etree.ParserError:
+            logging.error('Unable to parse url')
+            sys.exit(1)
+
+    def parse_xml(self, source):
+        """Parse xml-file to news array
+
+        Converts the feed entries to news dicts, caches them (or validates the
+        requested cache date) and runs the requested JSON/HTML/PDF steps.
+
+        :param source: iterable of feedparser entries
+        """
+        logging.info('Parse XML-file to news array')
+
+        for item in source:
+            self.news.append(self._entry_to_news(item))
+
+        if self.date is None:
+            self.cacher_object.cache(self.news)
+        elif not is_valid_date(self.date, '%Y%m%d'):
+            logging.error("Unexpected date format")
+            sys.exit(1)
+
+        if self.json is True:
+            self.json_object.format(self.news)
+
+        if self.convert_to_html is True:
+            self._export(HTMLConverter, 'HTML')
+
+        if self.convert_to_pdf is True:
+            self._export(PDFConverter, 'PDF')
+
+    def _entry_to_news(self, item):
+        """Build one news dict from a feed entry; missing fields become empty."""
+        date_key = next((key for key in reversed(self.DATE_KEYS) if key in item), None)
+
+        hrefs = [element['url']
+                 for key in self.MEDIA_KEYS if key in item
+                 for element in item[key] if 'url' in element]
+
+        return {
+            "title": self.clean_html_text(item.get('title', '')),
+            "date": item[date_key] if date_key else '',
+            "text": self.clean_html_text(self.strip_html_string(item.get('description', ''))),
+            # The query string is dropped to get the static link.
+            "link": item.get('link', '').split('?')[0],
+            "hrefs": hrefs,
+        }
+
+    def _load_cached_news(self):
+        """Return the cached news for self.date; log the reason and exit on failure."""
+        try:
+            return self.cacher_object.get_cached_news(self.date, self.limit)
+        except ValueError:
+            logging.error("News for this date not found")
+        except FileNotFoundError:
+            logging.error("Cache file not found")
+        sys.exit(1)
+
+    def _export(self, converter_class, label):
+        """Dump the fresh news (or the cached ones with --date) with a converter."""
+        news = self.news if self.date is None else self._load_cached_news()
+
+        converter_class(news, 'rss_reader_news').dump()
+        print(f'{label} file created in current directory')
+
+    def print_news(self):
+        """Print news to console"""
+        logging.info('Print news to console')
+
+        if self.date is None:
+            news = self.news
+        else:
+            news = self._load_cached_news()
+            if self.json is True:
+                self.json_object.format(news)
+
+        if self.json is True:
+            print(self.json_object)
+            return
+
+        for element in news:
+            self.print_one_news(element)
+            print()
+
+    def print_one_news(self, element):
+        """Print one news to console"""
+
+        if self.colorize is True:
+            print(f'{Fore.YELLOW + Back.BLACK + Style.BRIGHT}Title:{Fore.GREEN + Back.BLACK} {element["title"]}')
+            print(f'{Fore.YELLOW + Back.BLACK}Date: {Fore.RED + Back.BLACK} {element["date"]}')
+            print(f'{Fore.YELLOW + Back.BLACK}Link: {Fore.CYAN} {element["link"]}')
+            print(f'{Fore.YELLOW + Back.BLACK}News text:')
+            print(f'{Fore.WHITE + Back.BLACK}{element["text"]}')
+            print(f'{Fore.YELLOW + Back.BLACK}Hrefs:')
+            for href in element["hrefs"]:
+                print(f'{Fore.CYAN}| {href}')
+            # Reset so the colors do not leak into later terminal output.
+            print(Style.RESET_ALL, end='')
+        else:
+            print(f'Title: {element["title"]}')
+            print(f'Date: {element["date"]}')
+            print(f'Link: {element["link"]}')
+            print('News text:')
+            print(element["text"])
+            print('Hrefs:')
+            for href in element["hrefs"]:
+                print(f'| {href}')
+
+    def strip_html_string(self, string):
+        """Remove html tags from a string"""
+        logging.info("Remove html tags from a string")
+
+        return self.TAG_PATTERN.sub('', string)
+
+    def clean_html_text(self, string):
+        """Clean html string
+
+        :return: the text content of the HTML fragment; '' for blank input
+        """
+        # lxml raises ParserError on an empty document, which would abort the
+        # whole feed because of a single entry without text.
+        if not string or not string.strip():
+            return ''
+
+        doc = lxml.html.fromstring(string)
+        cleaner = lxml.html.clean.Cleaner(style=True)
+        doc = cleaner.clean_html(doc)
+        return doc.text_content()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a9712df
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,33 @@
+"""Packaging script"""
+
+import setuptools
+
+setuptools.setup(
+    name="rss-reader",
+    version="0.5",
+    author="Archeex",
+    author_email="qsanich@gmail.com",
+    description="Pure Python command-line RSS reader",
+    packages=setuptools.find_packages(exclude=['tests', 'tests.*']),
+    # The PDF converter needs the bundled font at run time, so it must be
+    # installed together with the code.
+    package_data={'rss_reader': ['FreeSans.ttf']},
+    # Same pins as requirements.txt, so "pip install ." yields a working tool.
+    install_requires=[
+        'feedparser==5.2.1',
+        'requests==2.22.0',
+        'tldextract==2.2.2',
+        'fpdf==1.7.2',
+        'colorama==0.4.1',
+        'lxml==4.4.2',
+    ],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    entry_points={
+        'console_scripts': ['rss_reader = rss_reader.__main__:main']
+    },
+    python_requires='>=3.8',
+)
diff --git a/tests/test_json_formatter.py b/tests/test_json_formatter.py
new file mode 100644
index 0000000..cd82fa2
--- /dev/null
+++ b/tests/test_json_formatter.py
@@ -0,0 +1,29 @@
+"""Tests for the JSON formatter"""
+
+import json
+import unittest
+
+from rss_reader.json_formatter import NewsJsonFormatter
+
+
+class TestNewsReader(unittest.TestCase):
+    """Check how NewsJsonFormatter stores and serializes news."""
+
+    def test_format(self):
+        """format() puts the news into a list under the 'feed' key."""
+        formatter = NewsJsonFormatter()
+
+        formatter.format([{"Title": "hello"}])
+
+        self.assertEqual(formatter.data, {"feed": [{"Title": "hello"}]})
+
+    def test_str(self):
+        """str() gives JSON that decodes back to the formatted data."""
+        formatter = NewsJsonFormatter()
+        formatter.format([{"Title": "hello"}])
+
+        self.assertEqual(json.loads(str(formatter)), {"feed": [{"Title": "hello"}]})
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_rss_reader.py b/tests/test_rss_reader.py
new file mode 100644
index 0000000..eac9414
--- /dev/null
+++ b/tests/test_rss_reader.py
@@ -0,0 +1,61 @@
+"""Tests for the RSS reader"""
+
+import unittest
+from unittest import mock
+
+from feedparser import FeedParserDict
+
+from rss_reader.rss_reader import NewsReader
+
+
+class TestNewsReader(unittest.TestCase):
+    """Check the entry parsing and HTML cleaning of NewsReader."""
+
+    def setUp(self):
+        # extract() is stubbed so that building the reader does not depend on
+        # tldextract's suffix data (presumably fetched over the network).
+        with mock.patch('rss_reader.rss_reader.extract'):
+            self.reader = NewsReader('https://news.tut.by/rss/index.rss', None, False, None, False, False, False)
+
+        # parse_xml() caches what it parsed; a stub keeps the test from
+        # writing cached_news.json.
+        self.reader.cacher_object = mock.Mock()
+
+    def test_parse_xml(self):
+        """parse_xml() turns a feed entry into a news dict and caches it."""
+        entry = FeedParserDict({
+            'title': 'Министр внутренних дел: Мы работаем не на показатели, а на граждан',
+            'link': 'https://news.tut.by/society/663397.html?utm_campaign=news-feed&utm_medium=rss&utm_source=rss-news',
+            'published': 'Sun, 01 Dec 2019 21:06:00 +0300',
+            'summary': '<p>МВД меняет подходы к наркопреступлениям, ставка будет на профилактику и на выявление крупных поставщиков, заявил в эфире программы «Контуры» телеканала ОНТ министр внутренних дел Юрий Караев.</p>',
+            'media_content': [
+                {
+                    'url': 'https://img.tyt.by/n/buryakina/01/7/karaev_20190410_bur_tutby_phsl-9055.jpg',
+                    'type': 'image/jpeg',
+                    'medium': 'image',
+                    'height': '800',
+                    'width': '1200',
+                    'filesize': '371255'
+                }
+            ]
+        })
+        expected = [{
+            'title': 'Министр внутренних дел: Мы работаем не на показатели, а на граждан',
+            'date': 'Sun, 01 Dec 2019 21:06:00 +0300',
+            'text': 'МВД меняет подходы к наркопреступлениям, ставка будет на профилактику и на выявление крупных поставщиков, заявил в эфире программы «Контуры» телеканала ОНТ министр внутренних дел Юрий Караев.',
+            'link': 'https://news.tut.by/society/663397.html',
+            'hrefs': ['https://img.tyt.by/n/buryakina/01/7/karaev_20190410_bur_tutby_phsl-9055.jpg']
+        }]
+
+        self.reader.parse_xml([entry])
+
+        self.assertEqual(self.reader.news, expected)
+        self.reader.cacher_object.cache.assert_called_once_with(expected)
+
+    def test_clean_html_text(self):
+        """clean_html_text() keeps the text and drops the markup."""
+        self.assertEqual(self.reader.clean_html_text("<p>Some <b>String</b></p>"), "Some String")
+
+
+if __name__ == '__main__':
+    unittest.main()