Skip to content

Dvorakouski Kiryl #37

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 35 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
f265ddb
Create rss_reader.py
KirylDv Nov 10, 2019
df71920
Delete rss_reader.py
KirylDv Nov 10, 2019
bb229b9
Create rss_reader.py
KirylDv Nov 10, 2019
aec5dd7
Add: arguments for parser
KirylDv Nov 12, 2019
7d412ef
Create file reader.py
KirylDv Nov 15, 2019
416704e
Add: parser
KirylDv Nov 15, 2019
e46c7cc
Add: console output for news
KirylDv Nov 15, 2019
48baada
Add: --verbose
KirylDv Nov 16, 2019
5a75dfb
Add: call --json function
KirylDv Nov 16, 2019
4617b7d
Add: --json
KirylDv Nov 16, 2019
751a4d0
Add shebang & correct codestyle
KirylDv Nov 16, 2019
3dcb76a
Fix: images without alt text
KirylDv Nov 16, 2019
cce26c1
1st Iteration
KirylDv Nov 16, 2019
4ddda5c
Add: setup.py
KirylDv Nov 17, 2019
92c78bf
2nd Iteration. Make changes for setup
KirylDv Nov 17, 2019
c2abc7c
Refactor convertion to json
KirylDv Nov 26, 2019
f0aa494
Add parts for cahce
KirylDv Nov 29, 2019
ca5588c
Add: database exists checking, change news from list fo dict
KirylDv Nov 29, 2019
5b0ba26
Add: working with cache methods
KirylDv Nov 29, 2019
578e9e4
Add require packages
KirylDv Nov 29, 2019
6be9bd3
Add forgotten ,
KirylDv Nov 29, 2019
ea6b17f
Add docstrings
KirylDv Nov 29, 2019
96dce68
3rd Iteration
KirylDv Nov 29, 2019
fa15607
Add: to_fb2 function
KirylDv Nov 30, 2019
69d093f
Correct fb2 doc
KirylDv Nov 30, 2019
6124b87
Now fb2 work with --path
KirylDv Nov 30, 2019
c68c60f
Add links to fb2
KirylDv Nov 30, 2019
677b7bf
Add: convertion to html
KirylDv Dec 1, 2019
b60845a
Add docstrings for html & fb2
KirylDv Dec 1, 2019
01fb821
Add: verbose
KirylDv Dec 1, 2019
fc84232
4th Iteration
KirylDv Dec 1, 2019
68ff764
Add colors
KirylDv Dec 1, 2019
c116d2e
5th Iteration
KirylDv Dec 1, 2019
330002b
5th Iteration. Strings of more than 120 characters removed
KirylDv Dec 1, 2019
2d5292c
Fix errors added with colorize
KirylDv Dec 10, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# One-shot RSS reader

1. --date - takes a date in YYYYMMDD format and returns cached news published on that date
2. --to_fb2 - converts the output to FB2 format
3. --to_html - converts the output to HTML format
4. --path - chooses the path used by the file-saving modes (--to_fb2, --to_html)
Empty file added __init__.py
Empty file.
92 changes: 92 additions & 0 deletions project/SQL_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import sqlite3
from os.path import exists
import sys
from .log_helper import stdout_write, write_progressbar


class Database():
    """SQLite-backed cache of RSS feeds and their news.

    Two tables are used: ``feed`` (source URL -> feed name, unique by source)
    and ``news`` (one row per article, unique by article link).

    Params:
        db_path: str - path of the SQLite file; ``DEFAULT_PATH`` when omitted
    """

    # Single place for the cache file name so every method agrees on it.
    DEFAULT_PATH = "cache.db"

    def __init__(self, db_path=None):
        self.db_path = self.DEFAULT_PATH if db_path is None else db_path
        self.conn = None
        self.cursor = None
        # IF NOT EXISTS covers a fresh file, an existing cache and a file that
        # exists but has no tables yet, without a separate existence check.
        self._open()
        try:
            self.cursor.execute("""
                CREATE TABLE IF NOT EXISTS `feed` (`source` text unique, `name` text)
            """)
            self.cursor.execute("""
                CREATE TABLE IF NOT EXISTS "news" ( `source` text, `date` text,
                                                    `title` text, `link` text UNIQUE,
                                                    `description` text, `links` text )
            """)
            self.conn.commit()
        finally:
            self._close()

    def _open(self):
        """Open a connection to the cache file and create a cursor on it."""
        self.conn = sqlite3.connect(self.db_path)
        self.cursor = self.conn.cursor()

    def _close(self):
        """Close the current connection, if any, and forget it."""
        # The connection is None when _open() itself failed; closing must not
        # raise then, because this runs from ``finally`` blocks.
        if self.conn is not None:
            self.conn.close()
        self.conn = None
        self.cursor = None

    def write_data(self, data, feed, url, verbose, color):
        """Write news to database

        Rows whose link is already cached are skipped; the remaining rows of
        the same batch are still stored.

        Params:
            data: sized iterable of 6-item tuples, one per article, in the
                  column order of the ``news`` table
                  (source, date, title, link, description, links)
            feed: str - rss_channel feed
            url: str - feed source URL
            verbose: bool - draw a progress bar while writing
            color: bool - colorize the error message
        """
        total = len(data) + 1
        try:
            self._open()
            if verbose:
                write_progressbar(total, 0)
            for counter, news in enumerate(data, start=1):
                # OR IGNORE: a duplicate link must not abort (and roll back)
                # the whole batch.
                self.cursor.execute("""
                    INSERT OR IGNORE INTO news
                    VALUES (?,?,?,?,?,?)
                """, news)
                if verbose:
                    write_progressbar(total, counter)
            self.cursor.execute("""
                INSERT OR IGNORE INTO feed
                VALUES (?,?)
            """, (url, feed))
            self.conn.commit()
        except sqlite3.DatabaseError:
            stdout_write("Database error", color="red", colorize=color)
        finally:
            self._close()
            if verbose:
                write_progressbar(total, total)

    def read_data(self, url, date, color):
        """Read cached news for one source and publication date.

        Params:
            url: str - feed source URL
            date: publication date as stored in the ``date`` column
            color: bool - colorize the error message

        Return:
            (feed, data) - ``feed`` is a list of 1-tuples with the feed name,
            ``data`` is a list of ``news`` rows; both are empty lists when
            nothing matches.

        On a database error the message is printed and the process exits
        with status 1.
        """
        feed, data = None, None
        try:
            self._open()
            # Values are bound as parameters, never formatted into the SQL.
            self.cursor.execute(
                "SELECT name from feed WHERE source = ?", (str(url),))
            feed = self.cursor.fetchall()
            self.cursor.execute(
                "SELECT * from news WHERE source = ? and date = ?",
                (str(url), str(date)))
            data = self.cursor.fetchall()
        except sqlite3.Error as e:
            stdout_write(f"Database reading error {e}", color="red", colorize=color)
            sys.exit(1)
        finally:
            self._close()
        return feed, data
Empty file added project/__init__.py
Empty file.
217 changes: 217 additions & 0 deletions project/converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
import json
import os
import urllib.error
import urllib.parse
import urllib.request
from base64 import b64encode
from html import escape, unescape
from random import randint
from time import time

from .log_helper import stdout_write, write_progressbar


def _download_image(url, verbose, sv_path, color=False):
    """Download the image at *url* into *sv_path*.

    Params:
        url: str - image address
        verbose: bool - print progress messages
        sv_path: str - directory the image is saved to
        color: bool - colorize the messages

    Return:
        str - path of the saved file, or "" when the download failed
    """
    stdout_write("Downloading image", verbose=verbose, color="blue", colorize=color)
    try:
        # Name the file after the URL *path* only, so a query string or
        # fragment does not end up in the file name; a URL ending in "/"
        # has no name of its own and gets a fixed fallback.
        file_name = os.path.basename(urllib.parse.urlsplit(url).path) or "image"
        local_name, _headers = urllib.request.urlretrieve(
            url, os.path.join(sv_path, file_name))
    except ValueError:
        # Raised for malformed or scheme-less URLs.
        stdout_write("Error: image not found", color="red", colorize=color)
        return ""
    except OSError:
        # URLError/HTTPError are OSError subclasses; this also covers local
        # failures such as a missing or unwritable target directory.
        stdout_write("Error occurred during downloading image", color="red", colorize=color)
        return ""
    stdout_write(f'Image "{url}" was downloaded.', verbose=verbose, color="green", colorize=color)
    return local_name


class Converter():
    """Converter class. Convert data to some format

    Every ``column`` argument is a sequence of news dicts; the methods read
    the keys ``title``, ``link``, ``text``, ``links`` and, when present,
    ``date``.
    """

    # Suffixes tagging each entry of news["links"]; these are the same
    # markers html_parser appends. NOTE(review): confirm every producer of
    # ``links`` (e.g. the cache) uses them too.
    _IMAGE_SUFFIX = " (image)"
    _TEXT_SUFFIX = " (text)"
    _UNKNOWN_DATE = "Unknown"

    @staticmethod
    def _markup_safe(value):
        """Return *value* as text safe for XML/HTML content and attributes."""
        # Unescape first so entities already present in feed text
        # (e.g. "&#39;") are not escaped a second time.
        return escape(unescape(str(value)))

    @classmethod
    def _split_links(cls, links):
        """Split tagged links into (image_links, text_links), tags removed."""
        image_links, text_links = [], []
        for link in links:
            if link.endswith(cls._IMAGE_SUFFIX):
                image_links.append(link[:-len(cls._IMAGE_SUFFIX)])
            elif link.endswith(cls._TEXT_SUFFIX):
                text_links.append(link[:-len(cls._TEXT_SUFFIX)])
            else:
                # Untagged entry: keep it whole instead of chopping its tail.
                text_links.append(link)
        return image_links, text_links

    def to_json(self, feed, column, verbose, color):
        """Take data and return it in json

        Params:
            feed - rss_channel feed (becomes the top-level "title")
            column - data from rss_channel
            verbose - print messages and a progress bar
            color - colorize the messages

        Return:
            str - JSON document with the keys "title" and "news"
        """
        stdout_write("Convert to json...", verbose=verbose, color="blue", colorize=color)
        total = len(column)
        # Nothing to draw for an empty list (and no division by zero).
        show_progress = verbose and total > 0
        if show_progress:
            write_progressbar(total, 0)
        articles = []
        for done, news in enumerate(column, start=1):
            article = {"title": news["title"]}
            if "date" in news:
                article["date"] = news["date"]
            article["link"] = news["link"]
            article["description"] = news["text"]
            article["links"] = list(news["links"])
            articles.append(article)
            if show_progress:
                write_progressbar(total, done)
        # The json module takes care of quoting and escaping.
        return json.dumps({"title": feed, "news": articles}, ensure_ascii=False, indent=4)

    def to_fb2(self, feed, column, url, sv_path=None, verbose=False, color=False):
        """Function convert data to fb2 and save as file

        Params:
            feed - rss_channel feed
            column - data from rss_channel
            url - link to source
            sv_path - directory for the fb2 doc and the downloaded images;
                      the current working directory when omitted
            verbose - print progress messages
            color - colorize the messages
        """
        if sv_path is None:
            # Resolved per call: os.getcwd() as a signature default would be
            # frozen at import time.
            sv_path = os.getcwd()
        safe = self._markup_safe

        stdout_write("Creating FB2 file", verbose=verbose, color="blue", colorize=color)
        document_id = hash(time() + randint(10000000, 1000000000000))
        lines = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"',
            '    xmlns:l="http://www.w3.org/1999/xlink">',
            '  <description>',
            '    <title-info>',
            '      <genre>sci_business</genre>',
            '      <author>',
            f'        <nickname>{safe(url)}</nickname>',
            '      </author>',
            f'      <book-title>{safe(feed)}</book-title>',
            '      <lang>en</lang>',
            '    </title-info>',
            '    <document-info>',
            '      <author>',
            f'        <nickname>{safe(url)}</nickname>',
            '      </author>',
            '      <date value="2011-11-11">11.11.2011</date>',
            '      <version>3.14</version>',
            f'      <id>{document_id}</id>',
            '    </document-info>',
            '  </description>',
            '  <body>',
        ]

        # base64 payload -> binary id, in first-seen order, so every distinct
        # image is embedded exactly once and ids are valid XML names.
        image_ids = {}

        stdout_write("Convert news", verbose=verbose, color="blue", colorize=color)
        for number, news in enumerate(column, start=1):
            image_links, text_links = self._split_links(news["links"])
            image_refs = []
            for link in image_links:
                img_path = _download_image(link, verbose, sv_path, color)
                try:
                    with open(img_path, 'rb') as binfile:
                        payload = b64encode(binfile.read()).decode()
                except OSError:
                    # Failed download ("" path) or unreadable file: the
                    # article is still written, just without this image.
                    continue
                image_refs.append(
                    image_ids.setdefault(payload, f"image{len(image_ids) + 1}.jpg"))

            stdout_write("Converting an article...", verbose=verbose, color="blue", colorize=color)
            date = news["date"] if "date" in news else self._UNKNOWN_DATE
            lines.append(f'    <section id="article{number}">')
            lines.append('      <title>')
            lines.append(f'        <p>{safe(news["title"])}</p>')
            lines.append('      </title>')
            lines.extend(f'      <image l:href="#{ref}"/>' for ref in image_refs)
            lines.append(f'      <p>{safe(date)}</p>')
            lines.append(f'      <p>{safe(news["text"])}</p>')
            if text_links:
                # One paragraph per link: line breaks inside <p> are not kept.
                lines.append('      <p>Links:</p>')
                lines.extend(f'      <p>{safe(link)}</p>' for link in text_links)
            lines.append(f'      <p>Source: {safe(feed)}</p>')
            lines.append('    </section>')
        stdout_write("Text data converted", verbose=verbose, color="green", colorize=color)

        lines.append('  </body>')
        lines.extend(
            f'  <binary id="{ref}" content-type="image/jpeg">{payload}</binary>'
            for payload, ref in image_ids.items())
        lines.append('</FictionBook>')
        stdout_write("Add binary part", verbose=verbose, color="green", colorize=color)

        file_path = os.path.join(sv_path, f"{hash(time())}-{randint(0, 100)}.fb2")
        # The XML declaration promises UTF-8, so write UTF-8 explicitly.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write("\n".join(lines) + "\n")
        stdout_write("FB2 document created", verbose=verbose, color="green", colorize=color)

    def to_html(self, feed, column, sv_path=None, verbose=False, color=False):
        """Function convert data to html and save as file

        Params:
            feed - rss_channel feed
            column - data from rss_channel
            sv_path - directory for the html doc and the downloaded images;
                      the current working directory when omitted
            verbose - print progress messages
            color - colorize the messages
        """
        if sv_path is None:
            # Resolved per call: os.getcwd() as a signature default would be
            # frozen at import time.
            sv_path = os.getcwd()
        safe = self._markup_safe

        stdout_write("Creating HTML version", verbose=verbose, color="blue", colorize=color)
        lines = [
            '<!DOCTYPE html>',
            '<html>',
            '  <head>',
            '    <meta charset="utf-8">',
            f'    <title>{safe(feed)}</title>',
            '  </head>',
            '  <body>',
        ]
        for news in column:
            image_links, text_links = self._split_links(news["links"])
            # A failed download yields "", which renders the alt text.
            images = [_download_image(link, verbose, sv_path, color) for link in image_links]
            date = news["date"] if "date" in news else self._UNKNOWN_DATE
            lines.append('    <div>')
            lines.append(f'      <h3>{safe(news["title"])}</h3>')
            # Local file paths are escaped as-is (no entity decoding).
            lines.extend(f'      <img src="{escape(img)}" alt="Not found">' for img in images)
            lines.append(f'      <p>{safe(news["text"])}</p>')
            lines.extend(f'      <a href="{safe(link)}">link </a>' for link in text_links)
            lines.append(f'      <p>Date: {safe(date)}</p>')
            lines.append('    </div>')
        lines.append('  </body>')
        lines.append('</html>')

        file_path = os.path.join(sv_path, f"{hash(time())}-{randint(0, 100)}.html")
        # Matches the charset declared in <head>.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write("\n".join(lines) + "\n")
        stdout_write("Finish HTML document", verbose=verbose, color="green", colorize=color)
37 changes: 37 additions & 0 deletions project/html_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from html.parser import HTMLParser


class _HTMLTagsParser(HTMLParser):
    """Class using for parsing html-formatted text

    While the markup is fed in, plain text is collected in ``text`` and the
    addresses of ``<img>`` / ``<a>`` tags in ``links``. Every link carries a
    type marker at its end: " (image)" or " (text)".
    """

    def __init__(self):
        super().__init__()
        self.links = []
        self.text = ""

    def handle_starttag(self, tag, attrs):
        """Convert <a> and <img> tags to text form

        Called by HTMLParser for every opening tag.

        Params:
            tag: str - lower-cased tag name
            attrs: list of (name, value) pairs; value is None for an
                   attribute written without a value (e.g. ``<a href>``)

        ``<img>`` adds "[Image: alt][N]" to the text, where N is the 1-based
        position of the image address in ``links``; ": alt" is left out when
        there is no alt text and "[N]" when there is no address.
        ``<a>`` only records its href in ``links``.
        """
        if tag == "img":
            self._handle_image(attrs)
        elif tag == "a":
            for name, value in attrs:
                # Skip empty/value-less hrefs: there is nothing to link to.
                if name == "href" and value:
                    self.links.append(value + " (text)")

    def _handle_image(self, attrs):
        """Record the image address and add its text placeholder."""
        alt_text = ""
        number = None
        for name, value in attrs:
            if name == "alt" and value:
                alt_text += f": {value}"
            elif name == "src" and value:
                self.links.append(value + " (image)")
                if number is None:
                    number = len(self.links)
        reference = "" if number is None else f"[{number}]"
        self.text += f"[Image{alt_text}]{reference}"

    def handle_data(self, data):
        """Take text from HTML"""
        self.text += data


def parse_HTML(text):
    """Return text without tags or links and a list with links"""
    parser = _HTMLTagsParser()
    parser.feed(text)
    # close() flushes text the parser is still buffering, e.g. a trailing
    # "AT&T", which it holds back in case the "&" starts a character reference.
    parser.close()
    return parser.text, parser.links
37 changes: 37 additions & 0 deletions project/log_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
def stdout_write(string, sep=' ', end='\n', flush=False, verbose=True, color="", colorize=False):
    """Print one string, turning every "&#39;" into an apostrophe.

    Nothing is printed when ``verbose`` is false. With ``colorize`` set and
    ``color`` being "red", "blue" or "green", the string is wrapped in the
    matching ANSI color code and a reset code; otherwise it is printed plain.
    ``sep``, ``end`` and ``flush`` are handed to ``print`` unchanged.
    """
    if not verbose:
        return

    ansi_codes = {
        "red": '\033[31m',
        "blue": '\033[34m',
        "green": '\033[92m',
    }
    prefix = ansi_codes.get(color, "") if colorize else ""
    # The reset code is only needed when a color code was actually emitted.
    suffix = '\033[0m' if prefix else ""

    cleaned = string.replace("&#39;", "'")
    print(prefix + cleaned + suffix, sep=sep, end=end, flush=flush)


def write_progressbar(elems, done, length=20):
    """Write progress bar to stdout

    Params:
        elems: count of elements
        done: progress (in elements)
        length: progress bar length

    The bar is redrawn on the same line (carriage return) for every call
    with ``done != 0`` and finished with a newline once it is complete.
    ``done`` outside ``0..elems`` is clamped, and ``elems <= 0`` is drawn as
    a complete bar, since there is nothing left to wait for.
    """
    if done != 0:
        print("\r", end="")
    if elems > 0:
        done = min(max(done, 0), elems)
        col = int(length * (done/elems))
        percent = int(100*done/elems)
        finished = done == elems
    else:
        # No elements at all: avoid dividing by zero.
        col, percent, finished = length, 100, True
    print(f"[{'='*col + ' '*(length-col)}] {percent}%", end="")
    if finished:
        print()
Loading