diff --git a/README.md b/README.md new file mode 100644 index 0000000..7ea5c38 --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +# Pure Python RSS Reader [PythonHomework] + +Python version: v3.8 + +Current version: v0.5 + +Code checking: Code corresponds to PEP 8 +#### Usage: +```shell +usage: __main__.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [--date DATE] [--to-pdf] [--to-html] source + +Pure Python command-line RSS reader. + +positional arguments: + source RSS URL + +optional arguments: + -h, --help show this help message and exit + --version Print version info + --json Print result as JSON in stdout + --verbose Outputs verbose status messages + --limit LIMIT Limit news topics if this parameter provided + --date DATE Show cached news by input date + --to-pdf Convert news to pdf format + --to-html Convert news to html format + ``` + JSON schema is described in `json_schema.json`. + + News is cached in the JSON file `cached_news.json` in the application root directory. diff --git a/json_schema.json b/json_schema.json new file mode 100644 index 0000000..35692b1 --- /dev/null +++ b/json_schema.json @@ -0,0 +1,31 @@ +{ + "$schema": "http://json-schema.org/schema#", + "title": "feed", + "type": "object", + "required": ["title", "date", "text", "link", "hrefs"], + "properties": { + "title": { + "type": "string", + "description": "News title" + }, + "date": { + "type": "date", + "description": "News published date" + }, + "text": { + "type": "string", + "description": "News text" + }, + "link": { + "type": "string", + "description": "News static link" + }, + "hrefs": { + "type": "array", + "items": { + "type": "string", + "description": "News hrefs" + } + } + } + } diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..be22e7f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +feedparser==5.2.1 +requests==2.22.0 +tldextract==2.2.2 +fpdf==1.7.2 +colorama==0.4.1 +lxml==4.4.2 \ No newline at end of file diff --git a/rss_reader/FreeSans.ttf
def main():
    """Run the command-line RSS reader.

    Initialises colorama, parses the command line, configures logging
    (INFO level when ``--verbose`` is given, the logging default level
    otherwise), then lets ``NewsReader`` parse the feed and print the news.
    """
    init()  # colorama init
    args = parse_args()

    log_format = "%(levelname)s: %(message)s"
    if args.verbose:
        logging.basicConfig(format=log_format, level=logging.INFO)
    else:
        logging.basicConfig(format=log_format)

    reader = NewsReader(args.source, args.limit, args.json, args.date,
                        args.to_pdf, args.to_html, args.colorize)
    reader.parse_url()

    reader.print_news()


def parse_args(argv=None):
    """Parse the command-line arguments of the RSS reader.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with the attributes ``source``, ``json``,
        ``verbose``, ``limit``, ``date``, ``to_pdf``, ``to_html`` and
        ``colorize``.

    Raises:
        SystemExit: Raised by argparse for ``--help``, ``--version`` and
            invalid arguments.
    """
    parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader.')

    parser.add_argument('source', help='RSS URL')
    # Kept in sync with the version declared in setup.py and the README.
    parser.add_argument('--version', help='Print version info', action='version', version='%(prog)s 0.5')
    parser.add_argument('--json', help='Print result as JSON in stdout', action='store_true')
    parser.add_argument('--verbose', help='Outputs verbose status messages', action='store_true')
    parser.add_argument('--limit', help='Limit news topics if this parameter provided', type=int)
    parser.add_argument('--date', help='Show cached news by input date', type=str)
    parser.add_argument('--to-pdf', help='Convert news to pdf format', action='store_true')
    parser.add_argument('--to-html', help='Convert news to html format', action='store_true')
    parser.add_argument('--colorize', help='Colorize output text', action='store_true')

    return parser.parse_args(argv)


if __name__ == '__main__':
    main()
def is_valid_date(date, pattern):
    """Return True if *date* can be parsed with the strptime *pattern*.

    Args:
        date: The date string to check.
        pattern: A ``datetime.strptime`` format string, e.g. ``'%Y%m%d'``.

    Returns:
        True when ``datetime.strptime(date, pattern)`` succeeds, False when
        the text does not match the pattern or *date* is not a string
        (for example None).
    """
    try:
        datetime.strptime(date, pattern)
    except (TypeError, ValueError):
        # ValueError: text does not match the pattern.
        # TypeError: a non-string value was passed.
        return False
    return True

{element["title"]}

\n' + html_code += f'

{element["date"]}

\n' + html_code += f'

{element["link"]} \ +

\n' + html_code += f'

{element["text"]}

\n' + + for href in element["hrefs"]: + html_code += f'

class NewsCacher:
    """Store parsed news in a JSON file, grouped by source and by date.

    The file holds a single JSON object of the form
    ``{source: {"YYYYMMDD": [news_item, ...]}}``.
    """

    def __init__(self, file_name, source):
        """Remember the cache file and the source this cacher works for.

        Args:
            file_name: Path of the JSON cache file.
            source: Key under which news of this feed are stored.
        """
        self.file_name = file_name
        self.source = source
        # Date key used by cache(): the day this object was created.
        self.date = datetime.now().strftime('%Y%m%d')
        self.data = None

    def get_cached_news(self, date, limit):
        """Get cached news from json file.

        Args:
            date: Date key (``'%Y%m%d'`` string) to look up.
            limit: Maximum number of news to return; None returns all.

        Returns:
            The list of cached news for ``self.source`` and *date*, sliced
            with ``[:limit]``.

        Raises:
            FileNotFoundError: The cache file does not exist.
            ValueError: There is no entry for the source/date, or the lookup
                failed with a TypeError. Invalid JSON in the file also
                surfaces as ValueError (``json.JSONDecodeError``).
        """
        logging.info("Get cached news from json file")

        with open(self.file_name, 'r', encoding='utf-8') as json_file:
            self.data = json.load(json_file)

        try:
            return self.data[self.source][date][:limit]
        except (KeyError, TypeError) as err:
            # Keep the ValueError type callers rely on, but say what was
            # missing and preserve the underlying cause.
            raise ValueError(f'No cached news for source {self.source!r} on date {date!r}') from err

    def cache(self, json_object):
        """Cache news to json file.

        News from *json_object* that are not yet stored for ``self.source``
        and ``self.date`` are appended, then the whole cache is written back
        to the file.

        Args:
            json_object: Iterable of news items (JSON-serialisable values).
        """
        logging.info("Cache news to json file")

        self.data = self._load()
        cached = self.data.setdefault(self.source, {}).setdefault(self.date, [])

        for element in json_object:
            if element not in cached:
                cached.append(element)

        with open(self.file_name, 'w', encoding='utf-8') as outfile:
            json.dump(self.data, outfile, ensure_ascii=False, indent=4)

    def _load(self):
        """Return the cache file content, or an empty dict if missing or blank."""
        try:
            with open(self.file_name, 'r', encoding='utf-8') as json_file:
                content = json_file.read()
        except FileNotFoundError:
            return {}

        # An empty or whitespace-only file is treated as an empty cache;
        # any other content that is not valid JSON still raises.
        if not content.strip():
            return {}
        return json.loads(content)
class NewsReader:
    """Fetch an RSS feed, turn its entries into news dicts and output them.

    Every news item is a dict with the keys ``title``, ``date``, ``text``,
    ``link`` and ``hrefs``. Depending on the constructor flags the news are
    printed as plain text or JSON and optionally dumped to HTML and PDF
    files. When ``date`` is given, the output is read from the cache instead
    of the freshly parsed feed.
    """

    # Entry keys that may hold the publication date; when an entry has
    # several of them, the key listed last takes precedence.
    DATE_KEYS = ['published', 'updated', 'pubDate']
    # Entry keys holding lists of media descriptors (dicts with a 'url' key).
    MEDIA_KEYS = ['media_thumbnail', 'media_content']

    _TAG_PATTERN = re.compile('<.*?>')
    _CACHE_DATE_FORMAT = '%Y%m%d'
    _OUTPUT_FILE_NAME = 'rss_reader_news'

    def __init__(self, link, limit, json, date, convert_to_pdf, convert_to_html, colorize):
        """Store the run options and create the cacher for the feed's site.

        Args:
            link: RSS feed URL.
            limit: Maximum number of news to process; None means no limit.
            json: Print the news as JSON when true.
            date: ``'%Y%m%d'`` date of cached news to show, or None to show
                the freshly parsed feed.
            convert_to_pdf: Dump the news to a PDF file when true.
            convert_to_html: Dump the news to an HTML file when true.
            colorize: Colorize the console output when true.
        """
        self.link = link
        self.limit = limit
        self.json = json
        self.hrefs = []
        self.news = []
        self.date = date
        self.json_object = NewsJsonFormatter()
        self.convert_to_pdf = convert_to_pdf
        self.convert_to_html = convert_to_html
        self.colorize = colorize

        # News are cached per site, keyed by "<domain>.<suffix>" of the URL.
        ext_site_name = extract(self.link)
        site_name = f'{ext_site_name.domain}.{ext_site_name.suffix}'

        self.cacher_object = NewsCacher('cached_news.json', site_name)

    def parse_url(self):
        """Get RSS xml-file from url and parse its entries.

        At most ``self.limit`` entries are passed to ``parse_xml``.
        Exits with status 1 when lxml raises ``ParserError`` for an entry.
        """
        logging.info('Get RSS XML-file from url')

        self.feed = feedparser.parse(self.link)
        try:
            self.parse_xml(self.feed.entries[:self.limit])
        except lxml.etree.ParserError:
            logging.error('Unable to parse url')
            raise SystemExit(1)

    def parse_xml(self, source):
        """Parse feed entries to the news array and run the requested outputs.

        Appends one news dict per entry in *source* to ``self.news``. Without
        ``self.date`` the news are written to the cache; with it, the date
        format is validated (exit with status 1 when invalid). Afterwards the
        JSON formatter is filled and the HTML / PDF files are created if the
        corresponding flags are set.

        Args:
            source: Iterable of feed entries.
        """
        logging.info('Parse XML-file to news array')

        for item in source:
            self.news.append(self._build_news_item(item))

        if self.date is None:
            self.cacher_object.cache(self.news)
        elif not is_valid_date(self.date, self._CACHE_DATE_FORMAT):
            logging.error("Unexpected date format")
            raise SystemExit(1)

        if self.json:
            self.json_object.format(self.news)

        if self.convert_to_html or self.convert_to_pdf:
            news = self._news_for_output()

            if self.convert_to_html:
                HTMLConverter(news, self._OUTPUT_FILE_NAME).dump()
                print('HTML file created in current directory')

            if self.convert_to_pdf:
                PDFConverter(news, self._OUTPUT_FILE_NAME).dump()
                print('PDF file created in current directory')

    def print_news(self):
        """Print news to console, as JSON or as plain text blocks."""
        logging.info('Print news to console')

        news = self._news_for_output()

        if self.json:
            if self.date is not None:
                # Fresh news were already formatted in parse_xml; cached
                # news have to be formatted here.
                self.json_object.format(news)
            print(self.json_object)
            return

        for element in news:
            self.print_one_news(element)
            print()

    def print_one_news(self, element):
        """Print one news to console.

        Args:
            element: News dict with the keys ``title``, ``date``, ``link``,
                ``text`` and ``hrefs``.
        """
        if self.colorize:
            print(f'{Fore.YELLOW + Back.BLACK + Style.BRIGHT}Title:{Fore.GREEN + Back.BLACK} {element["title"]}')
            print(f'{Fore.YELLOW + Back.BLACK}Date: {Fore.RED + Back.BLACK} {element["date"]}')
            print(f'{Fore.YELLOW + Back.BLACK}Link: {Fore.CYAN} {element["link"]}')
            print(f'{Fore.YELLOW + Back.BLACK}News text:')
            print(f'{Fore.WHITE + Back.BLACK}{element["text"]}')
            print(f'{Fore.YELLOW + Back.BLACK}Hrefs:')
            for href in element["hrefs"]:
                print(f'{Fore.CYAN}| {href}')
            # Without a reset the colours stay active for everything printed
            # afterwards, including the shell prompt after the program ends.
            print(Style.RESET_ALL, end='')
        else:
            print(f'Title: {element["title"]}')
            print(f'Date: {element["date"]}')
            print(f'Link: {element["link"]}')
            print('News text:')
            print(element["text"])
            print('Hrefs:')
            for href in element["hrefs"]:
                print(f'| {href}')

    def strip_html_string(self, string):
        """Remove html tags from a string.

        Everything between a ``<`` and the next ``>`` on the same line is
        dropped.
        """
        logging.info("Remove html tags from a string")

        return self._TAG_PATTERN.sub('', string)

    def clean_html_text(self, string):
        """Clean html string and return its text content.

        Returns an empty string for empty or whitespace-only input, because
        lxml raises ``ParserError`` ("Document is empty") for such input and
        that would abort the whole run.
        """
        if not string or not string.strip():
            return ''

        doc = lxml.html.fromstring(string)
        cleaner = lxml.html.clean.Cleaner(style=True)
        doc = cleaner.clean_html(doc)
        return doc.text_content()

    def _build_news_item(self, item):
        """Convert one feed entry to a news dict."""
        hrefs = []
        for key in self.MEDIA_KEYS:
            if key not in item:
                continue
            for element in item[key]:
                # Media descriptors without a URL cannot be linked; skip them.
                if 'url' in element:
                    hrefs.append(element['url'])

        return {
            "title": self.clean_html_text(item.title),
            "date": self._entry_date(item),
            "text": self.clean_html_text(self.strip_html_string(item.description)),
            # Drop the query string (e.g. tracking parameters) from the link.
            "link": item.link.split('?')[0],
            "hrefs": hrefs,
        }

    def _entry_date(self, item):
        """Return the entry's date value, or '' when it has no date key."""
        for key in reversed(self.DATE_KEYS):
            if key in item:
                return item[key]
        return ''

    def _news_for_output(self):
        """Return cached news when a date was requested, else the parsed news."""
        if self.date is None:
            return self.news
        return self._load_cached_news()

    def _load_cached_news(self):
        """Return cached news for ``self.date``; log and exit with status 1 on failure."""
        try:
            return self.cacher_object.get_cached_news(self.date, self.limit)
        except FileNotFoundError:
            logging.error("Cache file not found")
        except ValueError:
            logging.error("News for this date not found")
        raise SystemExit(1)
['https://img.tyt.by/n/buryakina/01/7/karaev_20190410_bur_tutby_phsl-9055.jpg']}]" + test_xml = """ +{ + 'title':'Министр внутренних дел: Мы работаем не на показатели, а на граждан', + 'link':'https://news.tut.by/society/663397.html?utm_campaign=news-feed&utm_medium=rss&utm_source=rss-news', + 'published':'Sun, 01 Dec 2019 21:06:00 +0300', + 'summary':'Фото: Дарья Бурякина, TUT.BYМВД меняет подходы к наркопреступлениям, ставка будет на профилактику и на выявление крупных поставщиков, заявил в эфире программы «Контуры» телеканала ОНТ министр внутренних дел Юрий Караев.
', + 'media_content':[ + { + 'url':'https://img.tyt.by/n/buryakina/01/7/karaev_20190410_bur_tutby_phsl-9055.jpg', + 'type':'image/jpeg', + 'medium':'image', + 'height':'800', + 'width':'1200', + 'filesize':'371255' + } + ] +} +""" + self.assertEqual(NewsReader.parse_xml(test_xml), computed_xml) + + def test_clean_html_text(self): + self.assertEqual(NewsReader.clean_html_text("Some String"), "SomeString") + +if __name__ == '__main__': + unittest.main() \ No newline at end of file