Pashkevich Anton #32

Open
wants to merge 11 commits into master
Changes from 7 commits
19 changes: 19 additions & 0 deletions final_task/LICENSE
@@ -0,0 +1,19 @@
Copyright (c) 2019 The Python Packaging Authority

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
44 changes: 44 additions & 0 deletions final_task/README.md
@@ -0,0 +1,44 @@
##### JSON structure

```
{
  "news": {
    "feed": "Yahoo News - Latest News & Headlines",
    "publications": [
      {
        "title": "Stefanik embraces spotlight at impeachment hearings",
        "pub_date": "Fri, 15 Nov 2019 17:55:51 -0500",
        "link": "https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html",
        "description": "[image 2: Stefanik embraces spotlight at impeachment hearings] [2]\nThe second day of the impeachment inquiry\u2019s public hearings, on Friday, began the same way\nas the first: with an attempt by Rep. Elise Stefanik, a New York Republican, to interrupt proceedings\nwith a procedural objection.",
        "hrefs": [
          [
            "https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html",
            "link"
          ],
          [
            "http://l.yimg.com/uu/api/res/1.2/NRuDo56c6EiwjZH4WOqEZg--/YXBwaWQ9eXRhY2h5b247aD04Njt3PTEzMDs-/https://media-mbst-pub-ue1.s3.amazonaws.com/creatr-uploaded-images/2019-11/7a1d0760-07d6-11ea-bef7-f17150574bb2",
            "image",
            "Stefanik embraces spotlight at impeachment hearings"
          ]
        ]
      }
    ]
  }
}
```
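
For illustration, a minimal sketch (not part of the package) of consuming this output with the standard library; the file name `news.json` is hypothetical:

```python
import json

# Load output previously saved from the reader's --json mode
# (hypothetical file name, for illustration only).
with open("news.json") as f:
    data = json.load(f)

print(data["news"]["feed"])
for post in data["news"]["publications"]:
    print(post["title"], "-", post["link"])
    for href in post["hrefs"]:
        # href is [url, type] for links and [url, "image", alt] for images
        print("   ", href[0], f"({href[1]})")
```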

##### Cache description

News received from a feed is cached in a database that is created locally.

The database consists of a single file named "cache.db". It has the following structure:

| | id | feed | title | pub_date | pub_parsed | link | description | hrefs |
|-----|------|------|-------|----------|------------|------|-------------|-------|
|post | .. | ... | ... | ... | ... | ... | ... | ... |

All fields except "id" are of text type. The "id" field serves as the post's primary key.

The "hrefs" field contains all of a post's links, including image links and image descriptions.
The regular-link section and the image-link section are separated by the `--|--` sequence.
Items within a section are separated by the `-+-` sequence, and `-|-` separates a link from its type and, for images, its description.
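
As a worked example (hypothetical URLs), a post with one regular link and one image link would be serialized as:

```
-+-https://example.com/post-|-link--|--https://example.com/img.png-|-image-|-alt text-+-
```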
2 changes: 2 additions & 0 deletions final_task/requirements.txt
@@ -0,0 +1,2 @@
feedparser
bs4
Empty file.
93 changes: 93 additions & 0 deletions final_task/rss_reader/cacher.py
@@ -0,0 +1,93 @@
"""
this module provides tools for caching news

it includes functions for work with database and support ones
"""

import sqlite3
from re import match

def init_database():
"""
this function creates and initizlizes database for caching news
"""
connection_obj = sqlite3.connect('cache.db')
cursor_obj = connection_obj.cursor()
cursor_obj.execute(
'''CREATE TABLE IF NOT EXISTS cache (id integer primary key, feed text, title text, pub_date text, pub_parsed text, link text, description text, hrefs text)'''
)
connection_obj.commit()

return connection_obj, cursor_obj

def cache_news(connection_obj, cursor_obj, news):
    """
    this function adds parsed news to the database, skipping duplicates
    """
    for post in news:
        cursor_obj.execute(
            '''SELECT id FROM cache WHERE feed=? AND title=? AND pub_date=? AND pub_parsed=?
               AND link=? AND description=? AND hrefs=?''',
            (post['feed'], post['title'], post['pub_date'], post['pub_parsed'],
             post['link'], post['description'], hrefs_to_text(post['hrefs']))
        )
        # insert only if an identical post is not cached already
        if cursor_obj.fetchone() is None:
            cursor_obj.execute(
                '''INSERT INTO cache (feed, title, pub_date, pub_parsed, link, description, hrefs)
                   VALUES (?, ?, ?, ?, ?, ?, ?)''',
                (post['feed'], post['title'], post['pub_date'], post['pub_parsed'],
                 post['link'], post['description'], hrefs_to_text(post['hrefs']))
            )
    connection_obj.commit()

    return

def get_cached_news(cursor_obj, date):
    """
    this function fetches news from the database and returns them as a list
    """
    cursor_obj.execute('''SELECT * FROM cache WHERE pub_parsed=?''', (date, ))
    rows = cursor_obj.fetchall()

    news = []
    for row in rows:
        data = {}
        data['feed'] = row[1]
        data['title'] = row[2]
        data['pub_date'] = row[3]
        data['pub_parsed'] = row[4]
        data['link'] = row[5]
        data['description'] = row[6]

        # the "hrefs" column stores the regular-link and image-link
        # sections separated by the "--|--" sequence
        hrefs = row[7].split("--|--")
        try:
            data['hrefs'] = [tuple(item.split("-|-")) for item in hrefs[0].split("-+-") if item != '']
            data['hrefs'] += [tuple(item.split("-|-")) for item in hrefs[1].split("-+-") if item != '']
        except IndexError:
            # the post has no image section
            pass
        news.append(data)

    return news

def hrefs_to_text(link_list):
    """
    this function serializes the list of a post's links into text form
    """
    res_line = ''
    ind = -1
    for tpl in link_list:
        if tpl[1] != 'image':
            # regular links: "-+-<url>-|-<type>"
            res_line += f"-+-{tpl[0]}-|-{tpl[1]}"
        else:
            # "--|--" separates the regular-link section from the image section
            res_line += '--|--'
            ind = link_list.index(tpl)
            break

    if ind != -1:
        # image links: "<url>-|-image-|-<description>-+-"
        for tpl in link_list[ind:]:
            res_line += f"{tpl[0]}-|-{tpl[1]}-|-{tpl[2]}-+-"

    return res_line


def is_valid_date(line):
    """
    this function checks that a date parameter matches the expected YYYYMMDD format
    """
    date = r"^[1-2][0-9]{3}[0-1][0-9][0-3][0-9]$"
    return match(date, line)
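
A minimal usage sketch of this module (illustration only; the post dictionary below is hypothetical but follows the shape `rss_reader.get_post_content` builds):

```python
import cacher

connection, cursor = cacher.init_database()

# hypothetical post in the shape produced by rss_reader.get_post_content
post = {
    'feed': 'Example Feed',
    'title': 'Hello',
    'pub_date': 'Fri, 15 Nov 2019 17:55:51 -0500',
    'pub_parsed': '20191115',
    'link': 'https://example.com/hello',
    'description': 'Hello world',
    'hrefs': [('https://example.com/hello', 'link')],
}
cacher.cache_news(connection, cursor, [post])

# dates must match the YYYYMMDD format checked by is_valid_date
if cacher.is_valid_date('20191115'):
    cached = cacher.get_cached_news(cursor, '20191115')
    print(f"{len(cached)} post(s) cached for that date")
```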
201 changes: 201 additions & 0 deletions final_task/rss_reader/rss_reader.py
@@ -0,0 +1,201 @@
"""
main rss_reader module
"""

import sys
import argparse
import logging
import html
import json
import feedparser
from bs4 import BeautifulSoup
import cacher

def init_cli_parser():
    """
    this function initializes the command line parser with all necessary arguments
    """
    parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader.', prog='rss-reader')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("source", type=str, nargs='?', default=None, help="RSS URL")
    parser.add_argument('--version', help="print version info", action='version', version='%(prog)s 1.3')
    parser.add_argument("--json", help="print result as JSON in stdout", action="store_true")
    parser.add_argument("--verbose", help="output verbose status messages", action="store_true")
    group.add_argument("--date", type=str, help="print news with the provided publish date in stdout")
    parser.add_argument("--limit", type=int, help="limit news topics if this parameter is provided")

    return parser.parse_args()

def init_logger():
    """
    this function initializes a logger connected to the log file
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler("rss_reader_logs.txt")
    file_handler.setFormatter(logging.Formatter('%(asctime)s -- %(levelname)s -- %(message)s'))
    logger.addHandler(file_handler)

    return logger

def brush_text(line):
    """
    this function wraps description text into a more convenient form,
    breaking it at the first space after every ~100 characters
    """
    start = 100
    while True:
        i = start - 10
        try:
            # walk forward to the nearest space and break the line there
            while line[i] != ' ':
                i += 1
        except IndexError:
            break
        line = line[:i] + "\n" + line[i + 1:]
        start += 100

    return line

def get_post_content(post, feed_title):
    """
    this function fetches the necessary elements of a publication from a post
    """
    data = {}
    data['feed'] = feed_title
    data['title'] = html.unescape(post.title)
    data['pub_date'] = post.published
    # zero-pad month and day so the value matches the YYYYMMDD format
    # expected by cacher.is_valid_date and the --date cache lookup
    data['pub_parsed'] = (f"{post.published_parsed.tm_year}"
                          f"{post.published_parsed.tm_mon:02d}{post.published_parsed.tm_mday:02d}")
    data['link'] = post.link
    soup = BeautifulSoup(post.description, 'html.parser')
    data['description'] = brush_text(html.unescape(soup.text))
    data['hrefs'] = [(link['href'], 'link') for link in soup.find_all('a') if link.get('href', None)]
    for img in soup.find_all('img'):
        if not img.get('src', 'Unknown') == '':
            data['hrefs'] += [(img.get('src', 'Unknown'), 'image', img.get('alt', ''))]
            data['description'] = \
                f"[image {len(data['hrefs'])}: {img.get('alt', '')}] [{len(data['hrefs'])}]\n" + data['description']

    return data

def parse_news(url):
    """
    this function parses news from the given url and returns a news list
    """
    feed = feedparser.parse(url)
    if feed.bozo == 1:
        raise ValueError("not well-formed xml or broken access to the Internet")

    news = []
    for post in feed.entries:
        news += [get_post_content(post, feed.feed.title)]

    return news

def display_news(news):
    """
    this function prints news to stdout
    """
    if len(news) == 0:
        return

    is_same_feed = all(news[0]['feed'] == item['feed'] for item in news)
    if is_same_feed:
        print(f"Feed: {news[0]['feed']}\n")

    for item in news:
        if not is_same_feed:
            print(f"Feed: {item['feed']}\n")
        print(f"Title: {item['title']}")
        print(f"Publication date: {item['pub_date']}")
        print(f"Link: {item['link']}\n")
        print(f"{item['description']}\n")
        print("Links:")
        for index, tpl in enumerate(item['hrefs']):
            print(f"[{index + 1}] {tpl[0]} ({tpl[1]})")
        print('\n')

    return

Collaborator (on the emptiness check): the suggested simplification is `if not news: return None`.

def to_json(news):
    """
    this function represents news in json format
    """
    for item in news:
        # "pub_parsed" is an internal cache key, not part of the JSON output
        del item['pub_parsed']

    return json.dumps({'news': news}, indent=2)

def main():
    """
    an entry point for the program
    """
    logger = init_logger()
    args = init_cli_parser()
    connection, cursor = cacher.init_database()

    if args.verbose:
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.info("verbose notifications are turned on")

    if args.limit is not None and args.limit < 1:
        # treat zero and negative limits as invalid
        if not args.verbose:
            print("error: invalid limit value")
        logger.error("invalid limit value")
        logger.info("end of work -->|")
        return

    if args.date:
        try:
            logger.info("checking date..")
            if not cacher.is_valid_date(args.date):
                raise ValueError("invalid date")
            logger.info("started fetching data from cache..")
            news = cacher.get_cached_news(cursor, args.date)
            if len(news) == 0:
                raise IndexError("no news for this date")
            news = news[:args.limit if args.limit else len(news)]
        except ValueError:
            if not args.verbose:
                print("error: invalid date")
            logger.error("invalid date")
            logger.info("end of work -->|")
            return
        except IndexError:
            if not args.verbose:
                print("no news for this date")
            logger.info("no news for this date")
            logger.info("end of work -->|")
            return

    if args.source:
        logger.info(f"started fetching data (url - {args.source})..")
        try:
            news = parse_news(args.source)
            logger.info("started caching data..")
            cacher.cache_news(connection, cursor, news)
            news = news[:args.limit if args.limit else len(news)]
        except ValueError:
            if not args.verbose:
                print("error: not well-formed xml or broken access to the Internet")
            logger.error("not well-formed xml or broken access to the Internet")
            logger.info("end of work -->|")
            return

    if args.limit:
        logger.info(f"the limit of publications to show - {args.limit}")

    if not args.json:
        logger.info("displaying news..\n")
        display_news(news)
    else:
        logger.info("displaying news in json format..\n")
        print(to_json(news))

    logger.info(f"\npublications were successfully shown - {len(news)}")
    logger.info("end of work -->|")

    return


if __name__ == "__main__":
    main()

Collaborator comments on main():
- Perhaps this function should be split into several smaller functions.
- Why use both print and the logger to output the same text? You could get by with the logger alone.
- It makes sense to add a text message to each exception raised here.
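
For reference, typical invocations of the reader, run from the final_task/rss_reader directory (the feed URL is an example only):

```
python rss_reader.py https://news.yahoo.com/rss --limit 3
python rss_reader.py https://news.yahoo.com/rss --json --verbose
python rss_reader.py --date 20191115 --limit 5
```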
22 changes: 22 additions & 0 deletions final_task/setup.py
@@ -0,0 +1,22 @@
import setuptools

with open("README.md", "r") as f:
    long_description = f.read()

setuptools.setup(
    name="rss-reader",
    version="1.2",
    author="Anton Pashkevich",
    author_email="mario.lazer@mail.ru",
    description="Pure Python command-line RSS reader",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/prague15031939/PythonHomework",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.8',
)

Collaborator: according to the task specification, installing the package should export an rss-reader utility; after `pip install .` no such utility appears:

# rss-reader
bash: rss-reader: command not found
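
One way to address the comment above is a console-script entry point; a hedged sketch, assuming the package directory `rss_reader` contains `rss_reader.py` with a `main()` function:

```python
setuptools.setup(
    # ...same arguments as above, plus:
    entry_points={
        "console_scripts": [
            # exposes an `rss-reader` command on PATH;
            # assumes rss_reader/rss_reader.py defines main()
            "rss-reader=rss_reader.rss_reader:main",
        ],
    },
)
```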