introduction-to-python-bsuir-2019 · KirylDv · Nov 10, 2019 · Nov 10, 2019 · Nov 10, 2019 · Nov 12, 2019
diff --git a/README.md b/README.md
@@ -0,0 +1,6 @@
+# One-shot RSS reader
+
+1. --date       -     take a date in YYYYMMDD format and return the cached news with that publication date
+2. --to_fb2     -     convert the output to FB2 format
+3. --to_html    -     convert the output to HTML format
+4. --path       -     choose the path used by the file-saving modes (--to_fb2, --to_html)
diff --git a/__init__.py b/__init__.py
diff --git a/project/SQL_cache.py b/project/SQL_cache.py
@@ -0,0 +1,92 @@
+import sqlite3
+from os.path import exists
+import sys
+from .log_helper import stdout_write, write_progressbar
+
+
+class Database():
+    """SQLite3-backed cache of RSS feeds and their news items.
+
+    The data is kept in the file ``cache.db`` (relative to the current working
+    directory) in two tables: ``feed`` (source, name) and ``news`` (source,
+    date, title, link, description, links). ``feed.source`` and ``news.link``
+    are UNIQUE, so every feed and every article is stored at most once.
+    """
+
+    def __init__(self):
+        """Create ``cache.db`` with both tables if the file does not exist yet."""
+        super(Database, self).__init__()
+        if not exists("cache.db"):
+            conn = sqlite3.connect("cache.db")
+            try:
+                cursor = conn.cursor()
+                cursor.execute("""
+                    CREATE TABLE `feed` (`source` text unique, `name` text)
+                    """)
+                cursor.execute("""
+                    CREATE TABLE "news" ( `source` text, `date` text,
+                    `title` text, `link` text UNIQUE,
+                    `description` text, `links` text )
+                    """)
+                conn.commit()
+            finally:
+                # Release the file handle even if table creation failed
+                conn.close()
+        self.conn = None
+        self.cursor = None
+
+    def _open(self):
+        """Open a connection to ``cache.db`` and a cursor on it."""
+        self.conn = sqlite3.connect("cache.db")
+        self.cursor = self.conn.cursor()
+
+    def _close(self):
+        """Close the current connection, if one was opened."""
+        # _open() may have failed before assigning a connection
+        if self.conn is not None:
+            self.conn.close()
+        self.conn = None
+        self.cursor = None
+
+    def write_data(self, data, feed, url, verbose, color):
+        """Write news to database
+        Params:
+        data: sequence of 6-item rows (source, date, title, link,
+              description, links) - article data
+        feed: str - rss_channel feed
+        url: str
+        verbose: bool - draw a progress bar while writing
+        color: bool - colorize the error message
+
+        Rows whose link is already cached are skipped; the remaining rows of
+        the batch are still stored.
+        """
+        total = len(data) + 1  # +1 step for the feed row / final commit
+        try:
+            self._open()
+            if verbose:
+                write_progressbar(total, 0)
+            for counter, news in enumerate(data, start=1):
+                # OR IGNORE: a duplicate link must not abort the whole batch
+                self.cursor.execute("""
+                    INSERT OR IGNORE INTO news
+                    VALUES (?,?,?,?,?,?)
+                    """, news)
+                if verbose:
+                    write_progressbar(total, counter)
+            self.cursor.execute("""
+                INSERT OR IGNORE INTO feed
+                VALUES (?,?)
+                """, (url, feed))
+            self.conn.commit()
+        except sqlite3.DatabaseError:
+            stdout_write("Database error", color="red", colorize=color)
+        finally:
+            self._close()
+            if verbose:
+                # Always finish the bar, also after an error
+                write_progressbar(total, total)
+
+    def read_data(self, url, date, color):
+        """Get url & date
+        Return feed & data
+
+        feed is the list of 1-tuples with the cached feed name for ``url``;
+        data is the list of cached news rows for ``url`` published on ``date``.
+        Prints an error and exits the program if the database cannot be read.
+        """
+        feed, data = None, None
+        try:
+            self._open()
+            # Bound parameters instead of string formatting: URLs may contain
+            # quotes and must never be interpreted as SQL
+            self.cursor.execute(
+                "SELECT name from feed WHERE source = ?", (str(url),))
+            feed = self.cursor.fetchall()
+            self.cursor.execute(
+                "SELECT * from news WHERE source = ? and date = ?",
+                (str(url), str(date)))
+            data = self.cursor.fetchall()
+        except Exception as e:
+            stdout_write(f"Database reading error {e}", color="red", colorize=color)
+            sys.exit()
+        finally:
+            self._close()
+        return feed, data
diff --git a/project/__init__.py b/project/__init__.py
diff --git a/project/converter.py b/project/converter.py
@@ -0,0 +1,217 @@
+from .log_helper import stdout_write, write_progressbar
+from random import randint
+from time import time
+from base64 import b64encode
+import os
+import urllib.request
+import urllib.error
+
+
+def _download_image(url, verbose, sv_path, color=False):
+    """Download an image into ``sv_path`` and return the local file path.
+
+    Params:
+    url - address of the image
+    verbose - print progress messages
+    sv_path - directory the image is saved to
+    color - colorize the messages
+
+    The file is named after the last path segment of ``url``. Returns an
+    empty string (after printing an error) when the download fails.
+    """
+    from urllib.parse import urlsplit  # function-scope: not imported by the module
+
+    stdout_write("Downloading image", verbose=verbose, color="blue", colorize=color)
+    try:
+        # Use only the URL path: "?query" and "#fragment" are not part of the
+        # file name and may contain characters invalid in file names
+        file_name = urlsplit(url).path.split('/')[-1]
+        if not file_name:
+            # e.g. "http://host/dir/" - there is no file to name the download after
+            stdout_write("Error: image not found", color="red", colorize=color)
+            return ""
+        local_name, _headers = urllib.request.urlretrieve(
+            url, sv_path + '/' + file_name)
+        stdout_write(f'Image "{url}" was downloaded.', verbose=verbose, color="green", colorize=color)
+        return local_name
+    except OSError:
+        # Covers URLError/HTTPError (both OSError subclasses) and local file
+        # errors such as a missing or read-only target directory
+        stdout_write("Error occurred during downloading image", color="red", colorize=color)
+        return ""
+    except ValueError:
+        # Raised for strings that are not valid URLs
+        stdout_write("Error: image not found", color="red", colorize=color)
+        return ""
+
+
+class Converter():
+    """Convert parsed RSS news to JSON text, FB2 files or HTML files.
+
+    ``column`` is a sequence of news items. Each item is a mapping with the
+    keys ``title``, ``link``, ``text`` and ``links`` plus an optional ``date``.
+    Every entry of ``links`` is a URL followed by a `` (image)`` or
+    `` (text)`` tag; the file converters strip that tag again.
+    """
+
+    @staticmethod
+    def _split_links(links):
+        """Split tagged links into (image_urls, text_urls) with the tags removed."""
+        image_links, text_links = [], []
+        for link in links:
+            # Test the suffix, not the whole string: a URL may itself contain "(image)"
+            if link.endswith("(image)"):
+                image_links.append(link[:-8])  # drop " (image)"
+            else:
+                text_links.append(link[:-7])  # drop " (text)"
+        return image_links, text_links
+
+    def to_json(self, feed, column, verbose, color):
+        """Take data and return it in json
+
+        Params:
+        feed - rss_channel feed (becomes the top-level "title")
+        column - data from rss_channel
+        verbose - print messages and a progress bar
+        color - colorize the messages
+        """
+        import json  # function-scope: not imported by the module
+
+        def quoted(value):
+            """Return value as a JSON string literal (quotes, backslashes
+            and control characters escaped)."""
+            return json.dumps(value, ensure_ascii=False)
+
+        stdout_write("Convert to json...", verbose=verbose, color="blue", colorize=color)
+        total = len(column)
+        # An empty column has no progress to show (and would divide by zero)
+        show_progress = verbose and total > 0
+        if show_progress:
+            write_progressbar(total, 0)
+        items = []
+        for counter, news in enumerate(column, start=1):
+            item = '{\n      "title": ' + quoted(news['title']) + ','
+            if 'date' in news:
+                item += '\n      "date": ' + quoted(news['date']) + ','
+            item += '\n      "link": ' + quoted(news['link']) + ','
+            item += '\n      "description": ' + quoted(news['text']) + ','
+            item += '\n      "links": ['
+            if news['links']:
+                item += ','.join('\n        ' + quoted(str(lin)) for lin in news['links'])
+                item += "\n      ]"
+            else:
+                item += ']'
+            item += "\n    }"
+            items.append(item)
+            if show_progress:
+                write_progressbar(total, counter)
+        return '{\n  "title": ' + quoted(feed) + ',\n  "news": [' + ','.join(items) + ']\n}'
+
+    def to_fb2(self, feed, column, url, sv_path=os.getcwd(), verbose=False, color=False):
+        """Function convert data to fb2 and save as file
+        Params:
+        feed - rss_channel feed (used as the book title)
+        column - data from rss_channel
+        url - link to source (stored as the author nickname)
+        sv_path - directory for the fb2 doc and the downloaded images
+        verbose - print progress messages
+        color - colorize the messages
+
+        NOTE: the ``sv_path`` default is evaluated once, when the class is
+        defined, so it is the working directory at import time.
+        """
+        from xml.sax.saxutils import escape  # function-scope: not imported by the module
+
+        def next_article(section_id, title, images, description, feed, date="Unknown"):
+            """return code for single article and
+                      binary files for used images
+            """
+            stdout_write("Converting an article...", verbose=verbose, color="blue", colorize=color)
+            # hash(img) ties every <image> reference to its <binary> payload
+            binary = [f'<binary id="{hash(img)}.jpg" content-type="image/jpeg">{img}</binary>'
+                      for img in images]
+            image_tags = ' '.join(f'<image l:href="#{hash(img)}.jpg"/>' for img in images)
+            # Feed text is arbitrary: escape &, < and > to keep the XML well-formed
+            return f"""        <section id="{section_id}">
+            <title>
+                <p>{escape(str(title))}</p>
+            </title>
+            {image_tags}
+            <p>{escape(str(date))}</p>
+            <p>{escape(str(description))}</p>
+            <p>Source: {escape(str(feed))}</p>
+        </section>
+""", binary
+
+        stdout_write("Creating FB2 file", verbose=verbose, color="blue", colorize=color)
+        fb2_begin = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
+            '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"' + \
+            '\n  xmlns:l="http://www.w3.org/1999/xlink">'
+        fb2_end = '</FictionBook>'
+        fb2_desc = f"""
+    <description>
+        <title-info>
+            <genre>sci_business</genre>
+            <author>
+                <nickname>{escape(str(url))}</nickname>
+            </author>
+            <book-title>{escape(str(feed))}</book-title>
+            <lang>en</lang>
+        </title-info>
+        <document-info>
+            <author>
+                <nickname>{escape(str(url))}</nickname>
+            </author>
+            <date value="2011-11-11">11.11.2011</date>
+            <version>3.14</version>
+            <id>{hash(time()+randint(10000000, 1000000000000))}</id>
+        </document-info>
+    </description>
+    <body>
+"""
+        binary = []
+        fb2_text = fb2_begin + fb2_desc
+
+        stdout_write("Convert news", verbose=verbose, color="blue", colorize=color)
+        for news in column:
+            image_links, text_links = self._split_links(news["links"])
+            images = []
+            for link in image_links:
+                img_path = _download_image(link, verbose, sv_path, color)
+                try:
+                    with open(img_path, 'rb') as binfile:
+                        images.append(b64encode(binfile.read()).decode())
+                except OSError:
+                    # Best effort: the failed download was already reported,
+                    # the article is simply written without this image
+                    pass
+            article, temp_bin = next_article(
+                section_id=hash(hash(news["title"]) + randint(1, 10000)),
+                title=news["title"],
+                images=images,
+                # The date is optional, as in to_json
+                date=news.get("date", "Unknown"),
+                description=news["text"] + 'links' + "\n".join(text_links),
+                feed=feed,
+            )
+            fb2_text += article
+            binary += temp_bin
+        stdout_write("Text data converted", verbose=verbose, color="green", colorize=color)
+        fb2_text += "   </body>"
+        # dict.fromkeys drops duplicate images but keeps a stable order
+        for img in dict.fromkeys(binary):
+            fb2_text += '\n'+img+'\n'
+        fb2_text += fb2_end
+        stdout_write("Add binary part", verbose=verbose, color="green", colorize=color)
+
+        file_path = f"{sv_path}/{hash(time())}-{randint(0, 100)}.fb2"
+        # The XML declaration promises UTF-8, so do not rely on the locale default
+        with open(file_path, "w", encoding="utf-8") as file:
+            file.write(fb2_text)
+        stdout_write("FB2 document created", verbose=verbose, color="green", colorize=color)
+
+    def to_html(self, feed, column, sv_path=os.getcwd(), verbose=False, color=False):
+        """Function convert data to html and save as file
+        Params:
+        feed - rss_channel feed (used as the page title)
+        column - data from rss_channel
+        sv_path - directory for the html doc and the downloaded images
+        verbose - print progress messages
+        color - colorize the messages
+
+        NOTE: the ``sv_path`` default is evaluated once, when the class is
+        defined, so it is the working directory at import time.
+        """
+        from html import escape  # function-scope: not imported by the module
+
+        def next_article(title, images, description, links, date="Unknown"):
+            """create html-code for single article"""
+            image_tags = ' '.join(
+                f'<img src="{escape(str(img))}" alt="Not found">' for img in images)
+            link_tags = ' '.join(
+                f'<a href="{escape(str(link))}">link </a>' for link in links)
+            return f"""
+        <div>
+            <h3>{escape(str(title))}</h3>
+            {image_tags}
+            <p>{escape(str(description))}</p>
+            {link_tags}
+            <p>Date: {escape(str(date))}</p>
+        </div>
+            """
+
+        def create_html(feed, main_part):
+            """wrap the articles into a complete html page"""
+            return f"""
+<!DOCTYPE html>
+<html>
+    <head>
+        <meta charset="utf-8">
+        <title>{escape(str(feed))}</title>
+    </head>
+    <body>
+{main_part}
+    </body>
+</html>
+"""
+
+        html_text = ""
+        stdout_write("Creating HTML version", verbose=verbose, color="blue", colorize=color)
+        for news in column:
+            image_links, text_links = self._split_links(news["links"])
+            images = [_download_image(link, verbose, sv_path, color)
+                      for link in image_links]
+            # One article per news item, whether or not it has images
+            html_text += next_article(links=text_links,
+                                      title=news["title"],
+                                      images=images,
+                                      date=news.get("date", "Unknown"),
+                                      description=news["text"],
+                                      )
+        html_text = create_html(feed, html_text)
+        file_path = f"{sv_path}/{hash(time())}-{randint(0, 100)}.html"
+        # Written as UTF-8 to match the charset declared in the page
+        with open(file_path, "w", encoding="utf-8") as file:
+            file.write(html_text)
+        stdout_write("Finish HTML document", verbose=verbose, color="green", colorize=color)
diff --git a/project/html_parser.py b/project/html_parser.py
@@ -0,0 +1,37 @@
+from html.parser import HTMLParser
+
+
+class _HTMLTagsParser(HTMLParser):
+    """Class using for parsing html-formatted text
+
+    After parsing, ``text`` holds the text content with every <img> replaced
+    by an ``[Image: alt][n]`` marker, and ``links`` holds the ``src``/``href``
+    values, each tagged with `` (image)`` or `` (text)``.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.links = []
+        self.text = ""
+
+    def handle_starttag(self, tag, attrs):
+        """Convert <a> and <img> tags to text form"""
+        if tag == "img":
+            # 1-based position the image link will take in self.links
+            num = len(self.links) + 1
+            has_source = False
+            self.text += "[Image"
+            for name, value in attrs:
+                # value is None for a bare attribute such as <img src>
+                if name == "alt" and value:
+                    self.text += f": {value}"
+                elif name == "src" and value is not None:
+                    self.links.append(value + " (image)")
+                    has_source = True
+            # Without a src no link was recorded, so there is nothing to number
+            self.text += f"][{num}]" if has_source else "]"
+        elif tag == "a":
+            for name, value in attrs:
+                if name == "href" and value is not None:
+                    self.links.append(value + " (text)")
+
+    def handle_data(self, data):
+        """Take text from HTML"""
+        self.text += data
+
+
+def parse_HTML(text):
+    """Return text without tags or links and a list with links"""
+    parser = _HTMLTagsParser()
+    parser.feed(text)
+    # close() flushes text that feed() keeps buffered while it waits for more
+    # input, e.g. a trailing "R&D" that could still become a character reference
+    parser.close()
+    return parser.text, parser.links
diff --git a/project/log_helper.py b/project/log_helper.py
@@ -0,0 +1,37 @@
+def stdout_write(string, sep=' ', end='\n', flush=False, verbose=True, color="", colorize=False):
+    """Print a single string, turning every &#39; into an apostrophe.
+
+    Nothing is printed unless ``verbose`` is true. With ``colorize`` set and
+    ``color`` one of "red", "blue" or "green", the text is wrapped in the
+    matching ANSI escape sequence; any other ``color`` prints plain text.
+    ``sep``, ``end`` and ``flush`` are passed on to print().
+    """
+    if not verbose:
+        return
+
+    ansi_codes = {
+        "red": '\033[31m',
+        "blue": '\033[34m',
+        "green": '\033[92m',
+    }
+    prefix, suffix = "", ""
+    if colorize and color in ansi_codes:
+        prefix = ansi_codes[color]
+        suffix = '\033[0m'
+
+    cleaned = string.replace("&#39;", "'")
+    print(prefix + cleaned + suffix, sep=sep, end=end, flush=flush)
+
+
+def write_progressbar(elems, done, length=20):
+    """Take arguments
+    elems: count of elements
+    done: progress (in elements)
+    length: progress bar length
+    Write progress bar to stdout
+
+    The bar is redrawn on the same line (carriage return) for every call
+    with ``done`` != 0 and finished with a newline once ``done`` reaches
+    ``elems``. With no elements at all the bar is drawn as complete.
+    """
+    if done != 0:
+        print("\r", end="")
+    if elems > 0:
+        # Keep the bar inside its brackets for out-of-range progress values
+        shown = min(max(done, 0), elems)
+        col = int(length * (shown/elems))
+        percent = int(100*shown/elems)
+    else:
+        # Nothing to process: avoid dividing by zero and report completion
+        col, percent = length, 100
+    print(f"[{'='*col + ' '*(length-col)}] {percent}%", end="")
+    if done >= elems:
+        print()