diff --git a/.gitignore b/.gitignore index 894a44c..b437fb5 100644 --- a/.gitignore +++ b/.gitignore @@ -84,6 +84,7 @@ celerybeat-schedule # Environments .env .venv +.vscode env/ venv/ ENV/ @@ -102,3 +103,15 @@ venv.bak/ # mypy .mypy_cache/ + +#logging +.idea +apps.log + +#file +News_feed.html + +.nginx +config +.vscode +.pytest_cache diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..098fb55 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.8 + +RUN mkdir /code + +WORKDIR /code +COPY . /code +COPY requirements.txt /code/requirements.txt + +RUN python3.8 -m pip install --no-cache-dir --upgrade -r requirements.txt +ENV PYTHONPATH="/code" \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..62a1f40 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include requirements.txt +include json_schema.json \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fa5754a --- /dev/null +++ b/README.md @@ -0,0 +1,117 @@ +# RSS reader + +RSS reader is a command-line utility which receives RSS URL and prints results in human-readable format. + +## Specification + +Utility provides the following interface: + + positional arguments: + + source - RSS URL + + optional arguments: + + -h, --help - Show help message and exit. + + --version - Print version info. + + --json - Print result as JSON in stdout. + + --verbose - Outputs verbose status messages. + + --limit - Limit news topics if this parameter is provided. + + --date - Return cached news from the specified day. Format is %Y%m%d (for example 20191128). **Shows the news of the day when you viewed them** + + --to-html - Save result to an HTML file + + --to-pdf - Save result to a PDF file + +## Install RSS reader v2.0 (work) +1. Create docker container: + ``` + docker run -it python /bin/bash + ``` +2. Clone or Download repository https://github.com/ZayJob/PythonHomework +3. Go to folder /PythonHomework +4. 
**git branch** +5. If there is no branch besides **master**, run the following: + ``` + git branch --track finalTask remotes/origin/finalTask + git checkout finalTask + git checkout cfbdb81 + ``` + +6. I recommend creating a virtual environment: **python3.8 -m venv env**, **. env/bin/activate**. +7. Build the package: **python3.8 setup.py sdist**. +8. Install the package: **pip3.8 install dist/rss_reader_ft-2.0.tar.gz**. +9. Install the requirements: **pip3.8 install -r requirements.txt**. +10. Use: + ``` + rss-reader "https://news.yahoo.com/rss/" --limit 1 + ``` + +## About my unittests ))) + +![](https://raw.githubusercontent.com/ZayJob/Telegram-bot/master/CryptoTower/6gGLqaT30sw.jpg) + +## Install RSS reader v5.0 (work) +1. Create docker container: + ``` + docker run -it -p 8080:8080 -v /var/run/docker.sock:/var/run/docker.sock python /bin/bash + ``` +2. Paste these commands into the console: + + ``` + git clone https://github.com/ZayJob/PythonHomework + + cd PythonHomework + + git branch --track finalTask remotes/origin/finalTask + git checkout finalTask + ``` + +3. Paste this command into the console: + + ``` + chmod +x install_script.sh && . install_script.sh + ``` + +4. Run the app: + ``` + docker-compose run app python -m rss_reader_ft "https://news.yahoo.com/rss" --limit 2 --colorize --to-html --to-pdf --verbose + ``` +5. If you want to see the database, open a browser and go to http://localhost:8081/db/News_feed/feeds + +6. 
If you want to get and view the HTML or PDF file, execute the following commands: + ``` + docker ps -a + ``` + You will see the following: + ``` + CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES + f2091f9472f4 pythonhomework_app "python -m rss_reade…" 2 minutes ago Exited (0) About a minute ago pythonhomework_app_run_4b117e008e87 + 9036216a4c28 mongo "docker-entrypoint.s…" 6 minutes ago Up 6 minutes 0.0.0.0:27017->27017/tcp pythonhomework_mongo_1 + 4fc0cef77ca2 mongo-express "tini -- /docker-ent…" 6 minutes ago Up 6 minutes 0.0.0.0:8081->8081/tcp pythonhomework_mongo-express_1 + b6288a095558 python "/bin/bash" 21 minutes ago Up 21 minutes interesting_chatterjee + + ``` + Copy the name of the most recently run app container and paste it as in the *example* + ``` + template: docker export <container_name> > latest.tar + + example: docker export pythonhomework_app_run_4b117e008e87 > latest.tar + ``` + Then extract the file: + ``` + tar -xf latest.tar code/News_feed.html + + ``` + or + ``` + + tar -xf latest.tar code/News_feed.pdf + ``` + The file is extracted into the code/ folder + +## Distribution +Utility is wrapped into package named rss_reader_ft. Additionally this package exports CLI utility named rss-reader. + +## Caching +The RSS news is cached in MongoDB while reading. + +## Format converter +You should implement the conversion of news in at least two of the suggested formats: .html, .pdf + +## Output colorization +You should add new optional argument --colorize, that will print the result of the utility in colorized mode \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..af45f77 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,32 @@ +version: '3.1' + +services: + mongo: + image: mongo + hostname: mongo + ports: + - 27017:27017 + depends_on: + - mongo-express + + app: + tty: true + container_name: rss_reader + stdin_open: true + depends_on: + - mongo + build: + context: . 
+ dockerfile: Dockerfile + ports: + - 3000:3000 + links: + - mongo:mongodb + environment: + - MONGO_URL=mongodb://mongo:27017/ + + mongo-express: + image: mongo-express + restart: always + ports: + - 8081:8081 diff --git a/install_script.sh b/install_script.sh new file mode 100644 index 0000000..8e0c741 --- /dev/null +++ b/install_script.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +apt-get update && \ +apt-get -y install apt-transport-https \ + ca-certificates \ + curl \ + gnupg2 \ + software-properties-common && \ +curl -fsSL https://download.docker.com/linux/$(. /etc/os-release; echo "$ID")/gpg > /tmp/dkey; apt-key add /tmp/dkey && \ +add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/$(. /etc/os-release; echo "$ID") \ + $(lsb_release -cs) \ + stable" && \ +apt-get update && \ +apt-get -y install docker-ce + +sudo curl -L https://github.com/docker/compose/releases/download/1.24.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose +sudo chmod +x /usr/local/bin/docker-compose + +docker-compose build + diff --git a/json_schema.json b/json_schema.json new file mode 100644 index 0000000..fe4e36e --- /dev/null +++ b/json_schema.json @@ -0,0 +1,126 @@ +{ + "type": "object", + "title": "The Root Schema", + "required": [ + "Feed", + "Url", + "News" + ], + "properties": { + "Feed": { + "$id": "#/properties/Feed", + "type": "string", + "title": "The Feed Schema", + "default": "", + "examples": [ + "Yahoo News - Latest News & Headlines" + ], + "pattern": "^(.*)$" + }, + "Url": { + "$id": "#/properties/Url", + "type": "string", + "title": "The Url Schema", + "default": "", + "examples": [ + "http://rss.news.yahoo.com/rss/" + ], + "pattern": "^(.*)$" + }, + "News": { + "$id": "#/properties/News", + "type": "array", + "title": "The News Schema", + "items": { + "$id": "#/properties/News/items", + "type": "object", + "title": "The Items Schema", + "required": [ + "Title", + "Date", + "Link", + "Description", + "Links" + ], + "properties": { + 
"Title": { + "$id": "#/properties/News/items/properties/Title", + "type": "string", + "title": "The Title Schema", + "default": "", + "examples": [ + "House Democrat warns Trump could be reelected despite impeachment inquiry" + ], + "pattern": "^(.*)$" + }, + "Date": { + "$id": "#/properties/News/items/properties/Date", + "type": "string", + "title": "The Date Schema", + "default": "", + "examples": [ + "Fri, 15 Nov 2019 18:07:20 -0500" + ], + "pattern": "^(.*)$" + }, + "Link": { + "$id": "#/properties/News/items/properties/Link", + "type": "string", + "title": "The Link Schema", + "default": "", + "examples": [ + "https://news.yahoo.com/house-democrat-warns-trump-could-be-reelected-despite-impeachment-inquiry-230720070.html" + ], + "pattern": "^(.*)$" + }, + "Description": { + "$id": "#/properties/News/items/properties/Description", + "type": "string", + "title": "The Description Schema", + "default": "", + "examples": [ + "Michigan Democrat Debbie Dingell thinks Trump could win reelection in 2020." 
+ ], + "pattern": "^(.*)$" + }, + "Links": { + "$id": "#/properties/News/items/properties/Links", + "type": "object", + "title": "The Links Schema", + "required": [ + "Source_link", + "Img_links" + ], + "properties": { + "Source_link": { + "$id": "#/properties/News/items/properties/Links/properties/Source_link", + "type": "string", + "title": "The Source_link Schema", + "default": "", + "examples": [ + "https://news.yahoo.com/house-democrat-warns-trump-could-be-reelected-despite-impeachment-inquiry-230720070.html" + ], + "pattern": "^(.*)$" + }, + "Img_links": { + "$id": "#/properties/News/items/properties/Links/properties/Img_links", + "type": "array", + "title": "The Img_links Schema", + "items": { + "$id": "#/properties/News/items/properties/Links/properties/Img_links/items", + "type": "string", + "title": "The Items Schema", + "default": "", + "examples": [ + "http://l.yimg.com/uu/api/res/1.2/xLjRXbrMf4Uu4liSopZAig--/YXBwaWQ9eXRhY2h5b247aD04Njt3PTEzMDs-/https://media-mbst-pub-ue1.s3.amazonaws.com/creatr-uploaded-images/2019-11/68be2920-07fb-11ea-baff-2733831db38e" + ], + "pattern": "^(.*)$" + } + } + } + } + } + } + } + } + } \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..99f3c29 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +feedparser==5.2.1 +bs4==0.0.1 +pymongo==3.9.0 +colored==1.4.0 +reportlab==3.5.32 +requests==2.22.0 \ No newline at end of file diff --git a/rss_reader_ft/__init__.py b/rss_reader_ft/__init__.py new file mode 100755 index 0000000..dc71333 --- /dev/null +++ b/rss_reader_ft/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- +"""Сonnected modules""" +__all__ = ['app', 'conversion', 'rss', 'db', ] diff --git a/rss_reader_ft/__main__.py b/rss_reader_ft/__main__.py new file mode 100755 index 0000000..0e231d5 --- /dev/null +++ b/rss_reader_ft/__main__.py @@ -0,0 +1,10 @@ +"""Entry point for package""" +from rss_reader_ft import rss_reader + + +def main(): + 
rss_reader.main() + + +if __name__ == "__main__": + main() diff --git a/rss_reader_ft/app/__init__.py b/rss_reader_ft/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rss_reader_ft/app/application.py b/rss_reader_ft/app/application.py new file mode 100644 index 0000000..af65709 --- /dev/null +++ b/rss_reader_ft/app/application.py @@ -0,0 +1,56 @@ +"""Module contains objects related to logs""" +import sys +import logging +from typing import Dict, Any + +from rss_reader_ft.app.application_log import ApplicationLog +from rss_reader_ft.app.argument_parser import ArgumentParser +from rss_reader_ft.rss.data_loader import DataLoader +from rss_reader_ft.rss.output import Output +from rss_reader_ft.rss.rss_feed import RSSFeed +from rss_reader_ft.db.mongodb import MongoDatabase +from rss_reader_ft.db.mongodb_config import URL_CONNECTION, DB_NAME, COLLECTION_NAME + + +class Application: + """Application class""" + def __init__(self): + """Init Application class""" + self.dict_args = ArgumentParser.parse_args() + + def run_app(self) -> None: + """Мethod sets application behavior""" + + logging.info(f'args{self.dict_args}') + + mongo_db = MongoDatabase(URL_CONNECTION, DB_NAME, COLLECTION_NAME) + mongo_db.database_connection() + + data = DataLoader(self.dict_args['source']).upload() + + feed = RSSFeed(self.dict_args, data) + + process_data = feed.data_processing() + + mongo_db.cache_news_feed(process_data) + + news = mongo_db.get_news(self.dict_args["limit"], self.dict_args["date"], self.dict_args["source"]) + + if news is not None: + if self.dict_args["json"]: + Output.to_json_format(news) + else: + if self.dict_args["colorize"]: + Output.to_rss_format_colored(news) + else: + Output.to_rss_format(news) + + if self.dict_args["to_html"]: + Output.to_html_format(news) + + if self.dict_args["to_pdf"]: + Output.to_pdf_format(news) + + if self.dict_args["verbose"]: + ApplicationLog.print_log() + sys.exit(1) diff --git a/rss_reader_ft/app/application_log.py 
b/rss_reader_ft/app/application_log.py new file mode 100644 index 0000000..6f3b541 --- /dev/null +++ b/rss_reader_ft/app/application_log.py @@ -0,0 +1,26 @@ +"""Module contains objects related to logs""" +import logging + + +class ApplicationLog: + """"ApplicationLog class""" + @staticmethod + def setup_logs() -> None: + """Method that sets logger configuration parameters""" + logging.basicConfig( + filename="apps.log", + filemode="w", + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO + ) + logging.info('Create base config for log') + + @staticmethod + def print_log() -> None: + """Method displays logs to the console""" + logging.info('Print the logs from the apps.log file to stdout') + + with open("apps.log", "r") as file_handler: + for line in file_handler: + print(line) diff --git a/rss_reader_ft/app/argument_parser.py b/rss_reader_ft/app/argument_parser.py new file mode 100644 index 0000000..b6eea04 --- /dev/null +++ b/rss_reader_ft/app/argument_parser.py @@ -0,0 +1,82 @@ +"""Module contains objects related to arguments parsing""" +import logging +from typing import Dict, Any +import argparse + +from rss_reader_ft.config import __version__ + + +class ArgumentParser: + """ArgumentParser class""" + + @staticmethod + def parse_args() -> Dict[str, Any]: + """ + The method in which we set the parser, + indicate what objects we expect from it, + and return the dictionary + """ + parser = ErrorCatchingArgumentParser(description='Python command-line RSS reader.') + parser.add_argument( + 'source', + help='Enter the link to the information portal(RSS url)', + type=str + ) + parser.add_argument( + '--version', + help='Print version info', + action='version', version=__version__ + ) + parser.add_argument( + '--json', + help='Print result as JSON in stdout', + action='store_true' + ) + parser.add_argument( + '--verbose', + help='Outputs verbose status messages', + action='store_true' + ) + parser.add_argument( + 
'--limit', + help='Limit news topics if this parameter is provided', + type=int + ) + parser.add_argument( + '--date', + help='The cashed news can be read with it. The new from the specified day will be printed out.\ + If the news are not found return an error.', + type=int + ) + parser.add_argument( + '--to-html', + help='Print result as HTML file', + action='store_true' + ) + parser.add_argument( + '--to-pdf', + help='Print result as PDF file', + action='store_true' + ) + parser.add_argument( + '--colorize', + help='Add color for print', + action='store_true' + ) + logging.info('Parsed arguments') + return vars(parser.parse_args()) + + +class ArgParserError(Exception): + """The exception class""" + + def __init__(self, message): + self.message = message + + +class ErrorCatchingArgumentParser(argparse.ArgumentParser): + """Overloading method error()""" + + def error(self, message) -> None: + + raise ArgParserError('Ooops.. Error))) check link and arguments (-h)') diff --git a/rss_reader_ft/config.py b/rss_reader_ft/config.py new file mode 100644 index 0000000..bc191c4 --- /dev/null +++ b/rss_reader_ft/config.py @@ -0,0 +1,3 @@ +"""This modules contains internal application configuration""" +__version__ = "5.0" +__package__ = "rss_reader_ft" diff --git a/rss_reader_ft/conversion/DejaVuSans.ttf b/rss_reader_ft/conversion/DejaVuSans.ttf new file mode 100644 index 0000000..e5f7eec Binary files /dev/null and b/rss_reader_ft/conversion/DejaVuSans.ttf differ diff --git a/rss_reader_ft/conversion/__init__.py b/rss_reader_ft/conversion/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rss_reader_ft/conversion/format_converter.py b/rss_reader_ft/conversion/format_converter.py new file mode 100644 index 0000000..47c6b7d --- /dev/null +++ b/rss_reader_ft/conversion/format_converter.py @@ -0,0 +1,9 @@ +"""Module contains objects related to converted""" +from abc import ABC, abstractmethod + + +class FormatConverter(ABC): + """FormatConverter abstract 
class""" + @abstractmethod + def convert_to_format(self): + pass diff --git a/rss_reader_ft/conversion/html_converter.py b/rss_reader_ft/conversion/html_converter.py new file mode 100644 index 0000000..7f593cf --- /dev/null +++ b/rss_reader_ft/conversion/html_converter.py @@ -0,0 +1,45 @@ +"""Module contains objects related to HTML""" +import logging +from typing import Dict, Any + +from rss_reader_ft.conversion.format_converter import FormatConverter + + +class HtmlConverter(FormatConverter): + """ + HtmlConverter class + inherited from FormatConverter abstract class. + """ + def __init__(self, rss_feed_dict: Dict[str, Any]): + """Init HtmlConverter class""" + self.convert_data = rss_feed_dict + + def convert_to_format(self) -> str: + """Сonversion method to HTML format""" + logging.info('Convert data to HTML and return it') + + html: str = """ + + + + + News feed + + + + """ + + html += f'' + + for entry in self.convert_data["News"]: + html += '' + + html += '

{self.convert_data["Feed"]} Ссылка

' + html += f'\n

{entry["Title"]}

' + html += f'

Ссылка на статью

' + for count, img_link in enumerate(entry["Links"]["Img_links"]): + html += f'

' + html += f'

Date: {entry["Date"]}

' + html += f'

{entry["Description"]}\n

' + html += '
' + return html diff --git a/rss_reader_ft/conversion/json_converter.py b/rss_reader_ft/conversion/json_converter.py new file mode 100644 index 0000000..b45782b --- /dev/null +++ b/rss_reader_ft/conversion/json_converter.py @@ -0,0 +1,22 @@ +"""Module contains objects related to JSON""" +import json +import logging +from typing import Dict, Any + +from rss_reader_ft.conversion.format_converter import FormatConverter + + +class JsonConverter(FormatConverter): + """ + JsonConverter class + inherited from FormatConverter abstract class. + """ + def __init__(self, rss_feed_dict: Dict[str, Any]): + """Init JsonConverter class""" + self.convert_data = rss_feed_dict + + def convert_to_format(self) -> str: + """Conversion method to JSON format""" + logging.info('Convert data to JSON and return it') + + return json.dumps(self.convert_data, indent=4, ensure_ascii=False) diff --git a/rss_reader_ft/conversion/pdf_converter.py b/rss_reader_ft/conversion/pdf_converter.py new file mode 100644 index 0000000..7956fce --- /dev/null +++ b/rss_reader_ft/conversion/pdf_converter.py @@ -0,0 +1,118 @@ +"""Module contains objects related to PDF""" +import logging +import os +from typing import Dict, Any + +import requests +from reportlab.pdfbase.ttfonts import TTFont +from reportlab.pdfgen import canvas +from reportlab.lib.pagesizes import letter, landscape +from reportlab.pdfbase import pdfmetrics + +from rss_reader_ft.conversion.format_converter import FormatConverter + + +class PdfConverter(FormatConverter): + """ + PdfConverter class + inherited from FormatConverter abstract class. 
+ """ + def __init__(self, rss_feed_dict: Dict[str, Any]): + """Init PdfConverter class""" + self.convert_data = rss_feed_dict + + def convert_to_format(self) -> None: + """Сonversion method to PDF format""" + logging.info('Convert data to PDF and return it') + + file_name = "News_feed.pdf" + self._generate_pdf_file(file_name) + + def _generate_pdf_file(self, file_name) -> None: + logging.info('Generate_pdf_file') + """ + Method to generate PDF file + """ + + pdf_file = canvas.Canvas(file_name, landscape(letter)) + pdfmetrics.registerFont(TTFont('DVS', 'DejaVuSans.ttf')) + + pdf_file.setFont('DVS', 24, leading=None) + pdf_file.drawCentredString(415, 250, self.convert_data['Feed']) + + pdf_file.setFont('DVS', 24, leading=None) + pdf_file.drawCentredString(415, 200, self.convert_data['Url']) + pdf_file.showPage() + + for count, news in enumerate(self.convert_data['News']): + y = self._edit_text(pdf_file, 500, news['Title']) + + for img in news["Links"]['Img_links']: + req = requests.get(img) + with open(f"img{count}.jpg", "w+b") as wf: + wf.write(req.content) + y -= 210 + pdf_file.drawImage(f"img{count}.jpg", 300, y, width=200, height=200) + + os.remove(f"img{count}.jpg") + + y -= 30 + pdf_file.setFont('DVS', 18, leading=None) + pdf_file.drawCentredString(415, y, news['Date']) + + y -= 30 + y = self._edit_link(pdf_file, y, news['Link']) + + y -= 30 + self._edit_text(pdf_file, y, news['Description']) + + pdf_file.showPage() + + pdf_file.save() + + @staticmethod + def _edit_link(pdf_file, y: int, string: str) -> int: + """A method that breaks a link to 2 honor""" + if len(string) >= 80: + link_start = string[:50] + link_end = string[50:] + + pdf_file.setFont('DVS', 14, leading=None) + pdf_file.drawCentredString(415, y, link_start) + y -= 13 + + pdf_file.setFont('DVS', 14, leading=None) + pdf_file.drawCentredString(415, y, link_end) + else: + pdf_file.setFont('DVS', 14, leading=None) + pdf_file.drawCentredString(415, y, string) + + return y + + @staticmethod + def 
_edit_text(pdf_file, y: int, string: str) -> int: + """ + A method that breaks text into words + and gathers lines of a certain length + from them so that the text does not go beyond the page + """ + if len(string) >= 70: + words = string.split(" ") + line = '' + for word in words: + if len(line) < 70: + line = " ".join([line, word]) + if len(line) >= 70: + pdf_file.setFont('DVS', 16, leading=None) + pdf_file.drawCentredString(415, y, line) + y -= 13 + line = '' + else: + pdf_file.setFont('DVS', 16, leading=None) + pdf_file.drawCentredString(415, y, line) + y -= 13 + else: + pdf_file.setFont('DVS', 16, leading=None) + pdf_file.drawCentredString(415, y, string) + + return y diff --git a/rss_reader_ft/db/__init__.py b/rss_reader_ft/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rss_reader_ft/db/mongodb.py b/rss_reader_ft/db/mongodb.py new file mode 100644 index 0000000..bd75ca5 --- /dev/null +++ b/rss_reader_ft/db/mongodb.py @@ -0,0 +1,91 @@ +"""Module contains objects related to work database""" +import datetime +import logging +from typing import Dict, Any + +import pymongo + + +class MongoDatabase: + """MongoDatabase class""" + + def __init__(self, URL_CONNECTION: str, DB_NAME: str, COLLECTION_NAME: str): + """Init MongoDatabase class""" + self.url_connection = URL_CONNECTION + self.db_name = DB_NAME + self.collection_name = COLLECTION_NAME + + def database_connection(self) -> None: + """Method for connecting to the database""" + try: + client = pymongo.MongoClient(self.url_connection) + db = client[self.db_name] + self.feed_collection = db[self.collection_name] + except ConnectionError as ex: + logging.error('Error connection to database') + logging.info('Сonnect to the database') + + def _check_news_feed(self, data: Dict[str, Any]) -> bool: + """Method for checking if an object is in the database""" + logging.info('Check for data in the database') + return self.feed_collection.find( + {"Url": data["Url"], "Date_Parsed": 
data["Date_Parsed"]}).count() == 0 + + def _update_news_feed(self, new_news_feed: Dict[str, Any]) -> None: + """ + Method for updating old news in the database + when parsing a news feed again + """ + logging.info('Updating old news') + + old_news_feed = self.feed_collection.find_one( + {"Url": new_news_feed["Url"], "Date_Parsed": new_news_feed["Date_Parsed"]}) + + news_update = [] + + for new_news in new_news_feed["News"]: + if new_news not in old_news_feed["News"]: + news_update.append(new_news) + + update_old_news_feed = old_news_feed["News"] + news_update + + self.feed_collection.update_one({"Url": old_news_feed["Url"], + "Date_Parsed": old_news_feed["Date_Parsed"]}, + {"$set": {"News": update_old_news_feed}}) + + def cache_news_feed(self, data: Dict[str, Any]) -> None: + """Method for caching data. + In which we determine whether the object exists and select an update or add + """ + if self._check_news_feed(data): + logging.info('Сached data') + self.feed_collection.insert_one(data) + else: + self._update_news_feed(data) + + def _check_date_in_database(self, date: int, source: str) -> bool: + logging.info('Date was found in db') + return self.feed_collection.find({"Url": source, "Date_Parsed": str(date)}).count() == 1 + + def get_news(self, limit: int, date: int, source: str) -> Dict[str, Any]: + """Find the cached feed for source (on date, or today) and trim it to limit; returns None if nothing is cached""" + logging.info('We get data from the database') + + news_feed = None + if date is not None: + if self._check_date_in_database(date, source): + news_feed = self.feed_collection.find_one( + {"Url": source, "Date_Parsed": str(date)}) + else: + logging.info(f'Incorrect date') + print(f'Data not found in date - {date}') + else: + news_feed = self.feed_collection.find_one( + {"Url": source, "Date_Parsed": datetime.datetime.today().strftime("%Y%m%d")} + ) + + if news_feed is not None and limit is not None:  # news_feed stays None when the requested date is not cached + if 0 < limit <= len(news_feed["News"]): + news_feed["News"] = news_feed["News"][:limit] + + return news_feed 
diff --git a/rss_reader_ft/db/mongodb_config.py b/rss_reader_ft/db/mongodb_config.py new file mode 100644 index 0000000..73b90a4 --- /dev/null +++ b/rss_reader_ft/db/mongodb_config.py @@ -0,0 +1,5 @@ +"""This modules contains internal database configuration""" + +URL_CONNECTION = "mongodb://mongo:27017/" +DB_NAME = "News_feed" +COLLECTION_NAME = "feeds" diff --git a/rss_reader_ft/rss/__init__.py b/rss_reader_ft/rss/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rss_reader_ft/rss/data_loader.py b/rss_reader_ft/rss/data_loader.py new file mode 100644 index 0000000..e0ee544 --- /dev/null +++ b/rss_reader_ft/rss/data_loader.py @@ -0,0 +1,25 @@ +"""Module contains objects related to data loading""" +import logging +from typing import Dict, Any + +import feedparser + + +class DataLoader: + """DataLoader class""" + def __init__(self, url_source: str): + """Init DataLoader class""" + self.url_source = url_source + + def upload(self) -> Dict[str, Any]: + """Method of loading data from a site by URL""" + data = None + try: + data = feedparser.parse(self.url_source) + if data.bozo != 0: + print("Incorrect url") + raise ConnectionError("Not work connection") + except Exception as ex: + logging.error(f'Error connection {ex}', exc_info=False) + logging.info('Get data by URL') + return data diff --git a/rss_reader_ft/rss/output.py b/rss_reader_ft/rss/output.py new file mode 100644 index 0000000..8505f7e --- /dev/null +++ b/rss_reader_ft/rss/output.py @@ -0,0 +1,73 @@ +"""Module contains objects related to printing data""" +import logging +from typing import Dict, Any + +from colored import fore, style, back + +from rss_reader_ft.conversion.json_converter import JsonConverter +from rss_reader_ft.conversion.html_converter import HtmlConverter +from rss_reader_ft.conversion.pdf_converter import PdfConverter + + +class Output: + """PrintData class""" + @staticmethod + def to_rss_format(rss_feed_dict: Dict[str, Any]) -> None: + """Output to the console""" + 
logging.info('Print RSS feed') + + print(f'Feed: {rss_feed_dict["Feed"]}') + for entry in rss_feed_dict["News"]: + print(f'\nTitle: {entry["Title"]}') + print(f'Date: {entry["Date"]}') + print(f'Link: {entry["Link"]}\n') + print(f'{entry["Description"]}\n') + print(f'Links:\n[1] {entry["Links"]["Source_link"]} (link)') + + for count, img_link in enumerate(entry["Links"]["Img_links"]): + print(f'[{count + 2}] {img_link} (image)') # 2 this a shift + + @staticmethod + def to_rss_format_colored(rss_feed_dict: Dict[str, Any]) -> None: + """Output to the console with color""" + logging.info('Print RSS feed') + print(fore.GREEN + style.BOLD + f'Feed: {rss_feed_dict["Feed"]}' + style.RESET) + for entry in rss_feed_dict["News"]: + print(fore.LIGHT_BLUE + style.BOLD + '\nTitle: ' + style.RESET + style.BOLD + f'{entry["Title"]}' + + style.RESET) + print(fore.LIGHT_RED + style.BOLD + 'Date: ' + style.RESET + style.BOLD + f'{entry["Date"]}' + + style.RESET) + print(fore.LIGHT_BLUE + style.BOLD + 'Link: ' + style.RESET + style.BOLD + f'{entry["Link"]}\n' + + style.RESET) + print(style.BOLD + f'{entry["Description"]}\n' + style.RESET) + print(fore.LIGHT_BLUE + f'Links:\n[1] {entry["Links"]["Source_link"]} (link)') + + for count, img_link in enumerate(entry["Links"]["Img_links"]): + print(f'[{count + 2}] {img_link} (image)' + style.RESET) # 2 this a shift + + @staticmethod + def to_json_format(rss_feed_dict: Dict[str, Any]) -> None: + """Output data to the console in JSON format""" + json_data = JsonConverter(rss_feed_dict).convert_to_format() + + logging.info('Print RSS feed in JSON format') + + print(json_data) + + @staticmethod + def to_html_format(rss_feed_dict: Dict[str, Any]) -> None: + """Output data to HTML file""" + html_data = HtmlConverter(rss_feed_dict).convert_to_format() + + logging.info('Print RSS feed in HTML file') + + with open('News_feed.html', 'w') as fw: + fw.write(html_data) + + @staticmethod + def to_pdf_format(rss_feed_dict: Dict[str, Any]) -> None: + 
"""Output data to PDF file""" + + PdfConverter(rss_feed_dict).convert_to_format() + + logging.info('Print RSS feed in PDF file') diff --git a/rss_reader_ft/rss/rss_feed.py b/rss_reader_ft/rss/rss_feed.py new file mode 100644 index 0000000..d8845ad --- /dev/null +++ b/rss_reader_ft/rss/rss_feed.py @@ -0,0 +1,47 @@ +"""Module contains objects related to rss feed""" +import logging +import datetime +import time +from typing import Dict, Any + +from bs4 import BeautifulSoup + + +class RSSFeed: + """RSSFeed class""" + def __init__(self, dict_args: Dict[str, Any], data: Dict[str, Any]): + """Init RSSFeed class""" + self.rss_url = dict_args['source'] + self.rss_feed = data + self.news = [] + self.rss_feed_dict = {} + + def data_processing(self) -> Dict[str, Any]: + """ + Method for converting rss data to a dictionary + and correcting them, + as well as processing the limit parameter + """ + self.rss_feed_dict.update({ + "_id": int(time.time()), + "Feed": self.rss_feed.feed.title, + "Url": self.rss_url, + "Date_Parsed": datetime.datetime.today().strftime("%Y%m%d") + }) + + for entry in self.rss_feed.entries: + soup = BeautifulSoup(entry.summary, features="html.parser") + self.news.append({ + "Title": str(entry.title).replace("'", "'"), + "Date": entry.published, + "Link": entry.link, + "Description": soup.text, + "Links": { + "Source_link": entry.links[0]["href"], + "Img_links": [link.get("src") for link in soup.find_all("img") if link.get("src")] + } + }) + + self.rss_feed_dict.update({"News": self.news}) + logging.info('Data processing for further work with them') + return self.rss_feed_dict diff --git a/rss_reader_ft/rss_reader.py b/rss_reader_ft/rss_reader.py new file mode 100755 index 0000000..122de7c --- /dev/null +++ b/rss_reader_ft/rss_reader.py @@ -0,0 +1,15 @@ +"""Application entry point module""" +import logging + +from rss_reader_ft.app.application import Application +from rss_reader_ft.app.application_log import ApplicationLog + + +def main() -> None: + """The 
main entry point of the application""" + try: + ApplicationLog.setup_logs() + app = Application() + app.run_app() + except Exception as ex: + logging.error(f'Error {ex}. Close application.', exc_info=False) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..68f1ae0 --- /dev/null +++ b/setup.py @@ -0,0 +1,21 @@ +from setuptools import setup, find_packages + +from rss_reader_ft import config + +setup( + name=config.__package__, + version=config.__version__, + description="One-shot command-line RSS reader", + long_description="RSS reader should be a command-line utility which receives RSS URL \ + and prints results in human-readable format.", + author="Vlad Bubeniuk", + author_email="zaybyst@mail.ru", + packages=find_packages(), + python_requires='>=3.8', + url="https://github.com/ZayJob/PythonHomework/tree/finalTask", + install_requires=["bs4", "feedparser", "pymongo", "colored", "requests", "reportlab"], + entry_points={ + 'console_scripts': + ['rss-reader = rss_reader_ft.__main__:main'] + } +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__main__.py b/tests/__main__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/app_test/__init__.py b/tests/unit/app_test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/conversion_test/__init__.py b/tests/unit/conversion_test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/conversion_test/conversion_test.py b/tests/unit/conversion_test/conversion_test.py new file mode 100644 index 0000000..6b0dfce --- /dev/null +++ b/tests/unit/conversion_test/conversion_test.py @@ -0,0 +1,29 @@ +"""Tests for rss_reader_ft.conversion module""" + +import unittest + +from rss_reader_ft.conversion.json_converter import JsonConverter +from rss_reader_ft.conversion.html_converter 
import HtmlConverter +from tests.unit.conversion_test.data import JSON_STR, NEWS, HTML_STR + + +class RssParserTestCase(unittest.TestCase): + """Test cases for FormatConverter class""" + + def test__convert_to_format_json(self): + """Function convert_to_format test""" + self.assertEqual( + JsonConverter(NEWS).convert_to_format(), + JSON_STR + ) + + def test__convert_to_format_html(self): + """Function convert_to_format test""" + self.assertEqual( + HtmlConverter(NEWS).convert_to_format(), + HTML_STR + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/conversion_test/data.py b/tests/unit/conversion_test/data.py new file mode 100644 index 0000000..e54394c --- /dev/null +++ b/tests/unit/conversion_test/data.py @@ -0,0 +1,54 @@ +JSON_STR = """{ + "_id": 1574890005, + "Feed": "Hubballi News, Latest Hubballi News Headlines & Live Updates - Times of India", + "Url": "https://timesofindia.indiatimes.com/rssfeeds/3942695.cms", + "Date_Parsed": "20191128", + "News": [ + { + "Title": "Karnataka bypolls: Is DK Shivakumar Congress trump card for bagging Vokkaliga votes?", + "Date": "Wed, 27 Nov 2019 13:54:05 GMT", + "Link": "https://timesofindia.indiatimes.com/city/hubballi/karnataka-bypolls-is-dk-shivakumar-congress-trump-card-for-bagging-vokkaliga-votes/articleshow/72263069.cms", + "Description": "Having faced the ignominy of being jailed in an alleged money laundering case, but coming out on bail as a superstar, Congress star campaigner and Vokkaliga leader DK Shivakumar over the last 48 hours has begun his tour of the south Karnataka constituencies that are going to polls on December 5.", + "Links": { + "Source_link": "https://timesofindia.indiatimes.com/city/hubballi/karnataka-bypolls-is-dk-shivakumar-congress-trump-card-for-bagging-vokkaliga-votes/articleshow/72263069.cms", + "Img_links": [ + "https://timesofindia.indiatimes.com/photo/72263069.cms" + ] + } + } + ] +}""" + +NEWS = { + "_id": 1574890005, + "Feed": "Hubballi News, Latest Hubballi 
News Headlines & Live Updates - Times of India", + "Url": "https://timesofindia.indiatimes.com/rssfeeds/3942695.cms", + "Date_Parsed": "20191128", + "News": [ + { + "Title": "Karnataka bypolls: Is DK Shivakumar Congress trump card for bagging Vokkaliga votes?", + "Date": "Wed, 27 Nov 2019 13:54:05 GMT", + "Link": "https://timesofindia.indiatimes.com/city/hubballi/karnataka-bypolls-is-dk-shivakumar-congress-trump-card-for-bagging-vokkaliga-votes/articleshow/72263069.cms", + "Description": "Having faced the ignominy of being jailed in an alleged money laundering case, but coming out on bail as a superstar, Congress star campaigner and Vokkaliga leader DK Shivakumar over the last 48 hours has begun his tour of the south Karnataka constituencies that are going to polls on December 5.", + "Links": { + "Source_link": "https://timesofindia.indiatimes.com/city/hubballi/karnataka-bypolls-is-dk-shivakumar-congress-trump-card-for-bagging-vokkaliga-votes/articleshow/72263069.cms", + "Img_links": [ + "https://timesofindia.indiatimes.com/photo/72263069.cms" + ] + } + } + ] +} + +HTML_STR = """ + + + + + News feed + + + +

Hubballi News, Latest Hubballi News Headlines & Live Updates - Times of India Ссылка

+

Karnataka bypolls: Is DK Shivakumar Congress trump card for bagging Vokkaliga votes?

Ссылка на статью

Date: Wed, 27 Nov 2019 13:54:05 GMT

Having faced the ignominy of being jailed in an alleged money laundering case, but coming out on bail as a superstar, Congress star campaigner and Vokkaliga leader DK Shivakumar over the last 48 hours has begun his tour of the south Karnataka constituencies that are going to polls on December 5. +

""" \ No newline at end of file diff --git a/tests/unit/db_test/__init__.py b/tests/unit/db_test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/db_test/data.py b/tests/unit/db_test/data.py new file mode 100644 index 0000000..627b9b1 --- /dev/null +++ b/tests/unit/db_test/data.py @@ -0,0 +1,20 @@ +NEWS = { + "_id": 1574890005, + "Feed": "Hubballi News, Latest Hubballi News Headlines & Live Updates - Times of India", + "Url": "https://timesofindia.indiatimes.com/rssfeeds/3942695.cms", + "Date_Parsed": "20191128", + "News": [ + { + "Title": "Karnataka bypolls: Is DK Shivakumar Congress trump card for bagging Vokkaliga votes?", + "Date": "Wed, 27 Nov 2019 13:54:05 GMT", + "Link": "https://timesofindia.indiatimes.com/city/hubballi/karnataka-bypolls-is-dk-shivakumar-congress-trump-card-for-bagging-vokkaliga-votes/articleshow/72263069.cms", + "Description": "Having faced the ignominy of being jailed in an alleged money laundering case, but coming out on bail as a superstar, Congress star campaigner and Vokkaliga leader DK Shivakumar over the last 48 hours has begun his tour of the south Karnataka constituencies that are going to polls on December 5.", + "Links": { + "Source_link": "https://timesofindia.indiatimes.com/city/hubballi/karnataka-bypolls-is-dk-shivakumar-congress-trump-card-for-bagging-vokkaliga-votes/articleshow/72263069.cms", + "Img_links": [ + "https://timesofindia.indiatimes.com/photo/72263069.cms" + ] + } + } + ] +} diff --git a/tests/unit/db_test/db_test.py b/tests/unit/db_test/db_test.py new file mode 100644 index 0000000..51b956d --- /dev/null +++ b/tests/unit/db_test/db_test.py @@ -0,0 +1 @@ +"""Tests for rss_reader_ft.db module""" diff --git a/tests/unit/rss_test/__init__.py b/tests/unit/rss_test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/rss_test/data.py b/tests/unit/rss_test/data.py new file mode 100644 index 0000000..c03493a --- /dev/null +++ b/tests/unit/rss_test/data.py @@ -0,0 
+1,6 @@ +import time +from xml.sax import SAXParseException + +RSS_PARSE_FROM_URL = {'feed': {'language': 'en-gb', 'title': 'Hubballi News, Latest Hubballi News Headlines & Live Updates - Times of India', 'title_detail': {'type': 'text/plain', 'language': 'en-US', 'base': 'https://timesofindia.indiatimes.com/rssfeeds/3942695.cms', 'value': 'Hubballi News, Latest Hubballi News Headlines & Live Updates - Times of India'}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://timesofindia.indiatimes.com/articlelist/3942695.cms'}, {'type': 'application/rss+xml', 'rel': 'self', 'href': 'https://timesofindia.indiatimes.com/rssfeeds/3942695.cms'}], 'link': 'https://timesofindia.indiatimes.com/articlelist/3942695.cms', 'subtitle': 'Hubballi News. TOI brings the latest Hubballi news headlines about Hubballi crime, Hubballi education news, Hubballi real estate news, Hubballi politics and Live Updates on local Hubballi news from Times of India - Hubballi news section.', 'subtitle_detail': {'type': 'text/html', 'language': 'en-US', 'base': 'https://timesofindia.indiatimes.com/rssfeeds/3942695.cms', 'value': 'Hubballi News. TOI brings the latest Hubballi news headlines about Hubballi crime, Hubballi education news, Hubballi real estate news, Hubballi politics and Live Updates on local Hubballi news from Times of India - Hubballi news section.'}, 'updated': 'Wed, 27 Nov 2019 21:26:45 GMT', 'updated_parsed': time.struct_time(tm_year=2019, tm_mon=11, tm_mday=27, tm_hour=21, tm_min=26, tm_sec=45, tm_wday=2, tm_yday=331, tm_isdst=0), 'rights': 'Copyright:(C) 2019 Bennett Coleman & Co. Ltd, http://in.indiatimes.com/policyterms/1554651.cms', 'rights_detail': {'type': 'text/plain', 'language': 'en-US', 'base': 'https://timesofindia.indiatimes.com/rssfeeds/3942695.cms', 'value': 'Copyright:(C) 2019 Bennett Coleman & Co. 
Ltd, http://in.indiatimes.com/policyterms/1554651.cms'}, 'docs': 'http://syndication.indiatimes.com/', 'image': {'title': 'Hubballi News, Latest Hubballi News Headlines & Live Updates - Times of India', 'title_detail': {'type': 'text/plain', 'language': 'en-US', 'base': 'https://timesofindia.indiatimes.com/rssfeeds/3942695.cms', 'value': 'Hubballi News, Latest Hubballi News Headlines & Live Updates - Times of India'}, 'href': 'https://timesofindia.indiatimes.com/photo/507610.cms', 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://timesofindia.indiatimes.com/articlelist/3942695.cms'}], 'link': 'https://timesofindia.indiatimes.com/articlelist/3942695.cms'}}, 'entries': [{'title': 'Karnataka bypolls: Is DK Shivakumar Congress trump card for bagging Vokkaliga votes?', 'title_detail': {'type': 'text/plain', 'language': 'en-US', 'base': 'https://timesofindia.indiatimes.com/rssfeeds/3942695.cms', 'value': 'Karnataka bypolls: Is DK Shivakumar Congress trump card for bagging Vokkaliga votes?'}, 'summary': 'Having faced the ignominy of being jailed in an alleged money laundering case, but coming out on bail as a superstar, Congress star campaigner and Vokkaliga leader DK Shivakumar over the last 48 hours has begun his tour of the south Karnataka constituencies that are going to polls on December 5.', 'summary_detail': {'type': 'text/html', 'language': 'en-US', 'base': 'https://timesofindia.indiatimes.com/rssfeeds/3942695.cms', 'value': 'Having faced the ignominy of being jailed in an alleged money laundering case, but coming out on bail as a superstar, Congress star campaigner and Vokkaliga leader DK Shivakumar over the last 48 hours has begun his tour of the south Karnataka constituencies that are going to polls on December 5.'}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://timesofindia.indiatimes.com/city/hubballi/karnataka-bypolls-is-dk-shivakumar-congress-trump-card-for-bagging-vokkaliga-votes/articleshow/72263069.cms'}], 
'link': 'https://timesofindia.indiatimes.com/city/hubballi/karnataka-bypolls-is-dk-shivakumar-congress-trump-card-for-bagging-vokkaliga-votes/articleshow/72263069.cms', 'id': 'https://timesofindia.indiatimes.com/city/hubballi/karnataka-bypolls-is-dk-shivakumar-congress-trump-card-for-bagging-vokkaliga-votes/articleshow/72263069.cms', 'guidislink': False, 'published': 'Wed, 27 Nov 2019 13:54:05 GMT', 'published_parsed': time.struct_time(tm_year=2019, tm_mon=11, tm_mday=27, tm_hour=13, tm_min=54, tm_sec=5, tm_wday=2, tm_yday=331, tm_isdst=0)}], 'bozo': 0, 'headers': {'Server': 'nginx', 'Content-Type': 'text/xml;charset=UTF-8', 'Content-Length': '17533', 'Vary': 'Accept-Encoding', 'Last-Modified': 'Wed, 27 Nov 2019 21:25:45 GMT', 'content-msg': 'DATA_SERVED_FROM_CACHE', 'Content-Language': 'en-US', 'Cache-Control': 'public, must-revalidate, max-age=777', 'Expires': 'Wed, 27 Nov 2019 21:45:45 GMT', 'Date': 'Wed, 27 Nov 2019 21:32:48 GMT', 'Connection': 'close', 'Access-Control-Max-Age': '86400', 'Access-Control-Allow-Credentials': 'false', 'Access-Control-Allow-Headers': 'Origin,X-Requested-With,Content-Type,Accept', 'Access-Control-Allow-Methods': 'GET,POST', 'Strict-Transport-Security': 'max-age=86400'}, 'updated': 'Wed, 27 Nov 2019 21:25:45 GMT', 'updated_parsed': time.struct_time(tm_year=2019, tm_mon=11, tm_mday=27, tm_hour=21, tm_min=25, tm_sec=45, tm_wday=2, tm_yday=331, tm_isdst=0), 'href': 'https://timesofindia.indiatimes.com/rssfeeds/3942695.cms', 'status': 200, 'encoding': 'UTF-8', 'version': 'rss20', 'namespaces': {'': 'http://www.w3.org/2005/Atom'}} + +RSS_INCORRECT_USL = {'feed': {}, 'entries': [], 'bozo': 1, 'encoding': 'utf-8', 'version': '', 'bozo_exception': SAXParseException('syntax error'), 'namespaces': {}} \ No newline at end of file diff --git a/tests/unit/rss_test/rss_test.py b/tests/unit/rss_test/rss_test.py new file mode 100644 index 0000000..157ee40 --- /dev/null +++ b/tests/unit/rss_test/rss_test.py @@ -0,0 +1 @@ +"""Tests for 
rss_reader_ft.rss module"""