-
Notifications
You must be signed in to change notification settings - Fork 32
Dvorakouski Kiryl #37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
f265ddb
df71920
bb229b9
aec5dd7
7d412ef
416704e
e46c7cc
48baada
5a75dfb
4617b7d
751a4d0
3dcb76a
cce26c1
4ddda5c
92c78bf
c2abc7c
f0aa494
ca5588c
5b0ba26
578e9e4
6be9bd3
ea6b17f
96dce68
fa15607
69d093f
6124b87
c68c60f
677b7bf
b60845a
01fb821
fc84232
68ff764
c116d2e
330002b
2d5292c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# One-shot RSS reader | ||
|
||
1. --date - take a date in YYYYMMDD format and return the cached news with that publication date | ||
2. --to_fb2 - convert the output to FB2 format | ||
3. --to_html - convert the output to HTML format | ||
4. --path - choose the directory where the converted files are saved |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import sqlite3 | ||
from os.path import exists | ||
import sys | ||
from .log_helper import stdout_write, write_progressbar | ||
|
||
|
||
class Database():
    """Cache RSS feeds and their news items in a SQLite3 database file.

    The database has two tables: ``feed`` (``source`` unique, ``name``) and
    ``news`` (``source``, ``date``, ``title``, ``link`` unique,
    ``description``, ``links``). A connection is opened for each operation
    and closed again afterwards.
    """

    # Default cache file, resolved relative to the current working directory.
    DB_PATH = "cache.db"

    def __init__(self, db_path=None):
        """Create the cache file with its schema if it does not exist yet.

        Params:
            db_path: str or None - database file to use; ``DB_PATH`` when
                omitted.
        """
        super().__init__()
        self.db_path = self.DB_PATH if db_path is None else db_path
        self.conn = None
        self.cursor = None
        if not exists(self.db_path):
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    CREATE TABLE `feed` (`source` text unique, `name` text)
                """)
                cursor.execute("""
                    CREATE TABLE "news" ( `source` text, `date` text,
                    `title` text, `link` text UNIQUE,
                    `description` text, `links` text )
                """)
                conn.commit()
            finally:
                conn.close()

    def _open(self):
        """Open a connection to the cache file and create a cursor on it."""
        self.conn = sqlite3.connect(self.db_path)
        self.cursor = self.conn.cursor()

    def _close(self):
        """Close the current connection, if one was opened."""
        # The connection is still None when sqlite3.connect() itself failed.
        if self.conn is not None:
            self.conn.close()
        self.conn = None
        self.cursor = None

    def write_data(self, data, feed, url, verbose, color):
        """Write news to database

        Rows that are already cached (same ``link``) and an already known
        feed (same ``source``) are skipped, so new articles are stored even
        when some of the batch was cached before.

        Params:
            data: sequence of 6-item tuples - article rows for table ``news``
            feed: str - rss_channel feed name
            url: str - rss_channel source url
            verbose: bool - draw a progress bar while writing
            color: bool - colorize the error message
        """
        # One extra step accounts for writing the feed row itself.
        total = len(data) + 1
        try:
            self._open()
            if verbose:
                write_progressbar(total, 0)
            for counter, news in enumerate(data, start=1):
                self.cursor.execute("""
                    INSERT OR IGNORE INTO news
                    VALUES (?,?,?,?,?,?)
                """, news)
                if verbose:
                    write_progressbar(total, counter)
            self.conn.commit()
            self.cursor.execute("""
                INSERT OR IGNORE INTO feed
                VALUES (?,?)
            """, (url, feed))
            self.conn.commit()
        except sqlite3.IntegrityError:
            # Constraint conflicts mean the data is cached already.
            pass
        except sqlite3.DatabaseError:
            stdout_write("Database error", color="red", colorize=color)
        finally:
            self._close()
            if verbose:
                write_progressbar(total, total)

    def read_data(self, url, date, color):
        """Read the cached feed name and news for a source and a date.

        Params:
            url: str - rss_channel source url
            date: str - publication date as stored in column ``date``
            color: bool - colorize the error message

        Returns:
            (feed, data): rows of ``name`` from table ``feed`` and full rows
            from table ``news``, both as lists of tuples.

        Exits the program through ``sys.exit()`` when reading fails.
        """
        feed, data = None, None
        try:
            self._open()
            # Bound parameters keep url/date from being interpreted as SQL.
            self.cursor.execute(
                "SELECT name from feed WHERE source = ?", (str(url),))
            feed = self.cursor.fetchall()
            self.cursor.execute(
                "SELECT * from news WHERE source = ? and date = ?",
                (str(url), str(date)))
            data = self.cursor.fetchall()
        except Exception as e:
            stdout_write(f"Database reading error {e}", color="red", colorize=color)
            sys.exit()
        finally:
            self._close()
        return feed, data
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
from .log_helper import stdout_write, write_progressbar | ||
from random import randint | ||
from time import time | ||
from base64 import b64encode | ||
import os | ||
import urllib.request | ||
import urllib.error | ||
|
||
|
||
def _download_image(url, verbose, sv_path, color=False):
    """Download an image into directory ``sv_path``.

    The local file name is the last path component of ``url``; the query
    string and fragment are not part of it.

    Params:
        url: str - address of the image
        verbose: bool - print progress messages
        sv_path: str - directory the image is saved to
        color: bool - colorize the messages

    Returns:
        Path of the saved file, or "" when the image could not be saved.
    """
    from urllib.parse import urlsplit

    stdout_write("Downloading image", verbose=verbose, color="blue", colorize=color)
    try:
        file_name = os.path.basename(urlsplit(url).path)
        if not file_name:
            # A URL without a file component cannot be stored as a file.
            raise ValueError(f"no file name in {url!r}")
        local_name, _headers = urllib.request.urlretrieve(
            url, os.path.join(sv_path, file_name))
    except ValueError:
        stdout_write("Error: image not found", color="red", colorize=color)
        return ""
    except OSError:
        # Covers URLError/HTTPError (both OSError subclasses) and failures
        # to write the local file.
        stdout_write("Error occurred during downloading image", color="red", colorize=color)
        return ""
    stdout_write(f'Image "{url}" was downloaded.', verbose=verbose, color="green", colorize=color)
    return local_name
|
||
|
||
class Converter():
    """Converter class. Convert data to some format

    ``column`` in every method is a sequence of dicts with the keys
    ``title``, ``link``, ``text``, ``links`` and, optionally, ``date``.
    Entries of ``links`` are URLs that end with an origin marker,
    " (image)" or " (text)".
    """

    # Origin markers that end every entry of news["links"].
    _IMAGE_SUFFIX = " (image)"
    _TEXT_SUFFIX = " (text)"

    @classmethod
    def _split_links(cls, links):
        """Return (image_urls, text_urls) with the origin markers removed."""
        image_links, text_links = [], []
        for link in links:
            if link.endswith(cls._IMAGE_SUFFIX):
                image_links.append(link[:-len(cls._IMAGE_SUFFIX)])
            elif link.endswith(cls._TEXT_SUFFIX):
                text_links.append(link[:-len(cls._TEXT_SUFFIX)])
            else:
                # No marker at all: keep the link untouched.
                text_links.append(link)
        return image_links, text_links

    def to_json(self, feed, column, verbose, color):
        """Take data and return it in json

        Params:
            feed - rss_channel feed
            column - data from rss_channel
            verbose - print progress messages and a progress bar
            color - colorize the messages

        Returns:
            str - JSON object with the keys "title" and "news".
        """
        import json

        stdout_write("Convert to json...", verbose=verbose, color="blue", colorize=color)
        total = len(column)
        # Nothing to draw for an empty list (and no division by zero).
        show_progress = verbose and total > 0
        if show_progress:
            write_progressbar(total, 0)
        articles = []
        for counter, news in enumerate(column, start=1):
            article = {"title": news["title"]}
            if "date" in news:
                article["date"] = news["date"]
            article["link"] = news["link"]
            article["description"] = news["text"]
            article["links"] = list(news["links"])
            articles.append(article)
            if show_progress:
                write_progressbar(total, counter)
        # json.dumps escapes quotes, backslashes and control characters.
        return json.dumps({"title": feed, "news": articles},
                          ensure_ascii=False, indent=2)

    def to_fb2(self, feed, column, url, sv_path=None, verbose=False, color=False):
        """Function convert data to fb2 and save as file

        Params:
            feed - rss_channel feed
            column - data from rss_channel
            url - link to source
            sv_path - directory for the fb2 doc and downloaded images;
                the current working directory when omitted
            verbose - print progress messages
            color - colorize the messages
        """
        from xml.sax.saxutils import escape

        if sv_path is None:
            # Resolved per call, not once at import time.
            sv_path = os.getcwd()

        def next_article(article_id, title, images, description, feed, date="Unknown"):
            """return code for single article and
            binary files for used images
            """
            stdout_write("Converting an article...", verbose=verbose, color="blue", colorize=color)
            binaries = [
                f'<binary id="{hash(img)}.jpg" content-type="image/jpeg">{img}</binary>'
                for img in images
            ]
            image_tags = ' '.join(f'<image l:href="#{hash(img)}.jpg"/>' for img in images)
            section = f"""    <section id="{article_id}">
      <title>
        <p>{escape(str(title))}</p>
      </title>
      {image_tags}
      <p>{escape(str(date))}</p>
      <p>{escape(str(description))}</p>
      <p>Source: {escape(str(feed))}</p>
    </section>
"""
            return section, binaries

        stdout_write("Creating FB2 file", verbose=verbose, color="blue", colorize=color)
        fb2_begin = ('<?xml version="1.0" encoding="UTF-8"?>\n'
                     '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"'
                     '\n  xmlns:l="http://www.w3.org/1999/xlink">')
        fb2_end = '</FictionBook>'
        safe_url = escape(str(url))
        fb2_desc = f"""
  <description>
    <title-info>
      <genre>sci_business</genre>
      <author>
        <nickname>{safe_url}</nickname>
      </author>
      <book-title>{escape(str(feed))}</book-title>
      <lang>en</lang>
    </title-info>
    <document-info>
      <author>
        <nickname>{safe_url}</nickname>
      </author>
      <date value="2011-11-11">11.11.2011</date>
      <version>3.14</version>
      <id>{hash(time()+randint(10000000, 1000000000000))}</id>
    </document-info>
  </description>
  <body>
"""
        parts = [fb2_begin, fb2_desc]
        binary = []

        stdout_write("Convert news", verbose=verbose, color="blue", colorize=color)
        for news in column:
            image_links, text_links = self._split_links(news["links"])
            images = []
            for link in image_links:
                img_path = _download_image(link, verbose, sv_path, color)
                if not img_path:
                    continue  # download failed, nothing to embed
                try:
                    with open(img_path, 'rb') as binfile:
                        images.append(b64encode(binfile.read()).decode())
                except FileNotFoundError:
                    pass
            article, temp_bin = next_article(
                article_id=hash(hash(news["title"]) + randint(1, 10000)),
                title=news["title"],
                images=images,
                date=news.get("date", "Unknown"),
                description=news["text"] + 'links' + "\n".join(text_links),
                feed=feed,
            )
            parts.append(article)
            binary += temp_bin
        stdout_write("Text data converted", verbose=verbose, color="green", colorize=color)
        parts.append("  </body>")
        # The same image may occur in several articles; embed it only once.
        for img in dict.fromkeys(binary):
            parts.append('\n' + img + '\n')
        parts.append(fb2_end)
        stdout_write("Add binary part", verbose=verbose, color="green", colorize=color)

        file_path = os.path.join(sv_path, f"{hash(time())}-{randint(0, 100)}.fb2")
        # The XML declaration promises UTF-8, so write exactly that.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write("".join(parts))
        stdout_write("FB2 document created", verbose=verbose, color="green", colorize=color)

    def to_html(self, feed, column, sv_path=None, verbose=False, color=False):
        """Function convert data to html and save as file

        Params:
            feed - rss_channel feed
            column - data from rss_channel
            sv_path - directory for the html doc and downloaded images;
                the current working directory when omitted
            verbose - print progress messages
            color - colorize the messages
        """
        from html import escape

        if sv_path is None:
            # Resolved per call, not once at import time.
            sv_path = os.getcwd()

        def next_article(title, images, description, links, date="Unknown"):
            """create html-code for single article"""
            image_tags = ' '.join(
                f'<img src="{escape(str(img))}" alt="Not found">' for img in images)
            link_tags = ' '.join(
                f'<a href="{escape(str(link))}">link </a>' for link in links)
            return f"""
    <div>
      <h3>{escape(str(title))}</h3>
      {image_tags}
      <p>{escape(str(description))}</p>
      {link_tags}
      <p>Date: {escape(str(date))}</p>
    </div>
"""

        def create_html(feed, main_part):
            """wrap the articles into a complete html document"""
            return f"""
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>{escape(str(feed))}</title>
  </head>
  <body>
    {main_part}
  </body>
</html>
"""

        articles = []
        stdout_write("Creating HTML version", verbose=verbose, color="blue", colorize=color)
        for news in column:
            image_links, text_links = self._split_links(news["links"])
            images = [_download_image(link, verbose, sv_path, color)
                      for link in image_links]
            articles.append(next_article(
                links=text_links,
                title=news["title"],
                images=images,
                date=news.get("date", "Unknown"),
                description=news["text"],
            ))
        html_text = create_html(feed, "".join(articles))
        file_path = os.path.join(sv_path, f"{hash(time())}-{randint(0, 100)}.html")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(html_text)
        stdout_write("Finish HTML document", verbose=verbose, color="green", colorize=color)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from html.parser import HTMLParser | ||
|
||
|
||
class _HTMLTagsParser(HTMLParser):
    """Class using for parsing html-formatted text

    After feeding, ``text`` holds the character data with every ``<img>``
    replaced by an ``[Image: alt][n]`` placeholder, and ``links`` holds the
    image sources and anchor targets, each followed by an origin marker.
    """

    # Origin markers appended to the collected URLs.
    IMAGE_MARK = " (image)"
    TEXT_MARK = " (text)"

    def __init__(self):
        super().__init__()
        self.links = []
        self.text = ""

    def handle_starttag(self, tag, attrs):
        """Convert <a> and <img> tags to text form"""
        if tag == "img":
            self._handle_image(attrs)
        elif tag == "a":
            for name, value in attrs:
                # Attributes written without a value arrive as None.
                if name == "href" and value is not None:
                    self.links.append(value + self.TEXT_MARK)

    def _handle_image(self, attrs):
        """Add an image placeholder to the text and its source to the links."""
        # Position the image source will take in self.links (1-based).
        number = len(self.links) + 1
        placeholder = "[Image"
        has_source = False
        for name, value in attrs:
            if value is None:
                continue
            if name == "alt":
                if value != "":
                    placeholder += f": {value}"
            elif name == "src":
                self.links.append(value + self.IMAGE_MARK)
                has_source = True
        placeholder += "]"
        if has_source:
            # Reference a link only when one was actually stored.
            placeholder += f"[{number}]"
        self.text += placeholder

    def handle_data(self, data):
        """Take text from HTML"""
        self.text += data


def parse_HTML(text):
    """Return text without tags or links and a list with links"""
    parser = _HTMLTagsParser()
    parser.feed(text)
    # feed() may hold back trailing text (e.g. after a bare "&") while it
    # waits for more input; close() makes the parser process it.
    parser.close()
    return parser.text, parser.links
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
def stdout_write(string, sep=' ', end='\n', flush=False, verbose=True, color="", colorize=False):
    """Print a single string, converting the HTML entity &#39; to an apostrophe.

    Params:
        string: str - text to print
        sep, end, flush - passed to print() unchanged
        verbose: bool - nothing is printed when False
        color: str - "red", "blue" or "green"; any other value means no color
        colorize: bool - wrap the text in ANSI color codes when True
    """
    if not verbose:
        return
    ansi_codes = {
        "red": '\033[31m',
        "blue": '\033[34m',
        "green": '\033[92m',
    }
    prefix = ansi_codes.get(color, "") if colorize else ""
    # Reset the terminal color only when a color was actually switched on.
    suffix = '\033[0m' if prefix else ""
    string = string.replace("&#39;", "'")
    print(prefix + string + suffix, sep=sep, end=end, flush=flush)
|
||
|
||
def write_progressbar(elems, done, length=20):
    """Write progress bar to stdout

    Params:
        elems: count of elements; zero or less is shown as finished
        done: progress (in elements); limited to the range 0..elems
        length: progress bar length
    """
    if elems <= 0:
        # Nothing to process: show a finished bar instead of dividing by zero.
        filled, percent, finished = length, 100, True
    else:
        done = min(max(done, 0), elems)
        filled = int(length * (done / elems))
        percent = int(100 * done / elems)
        finished = done == elems
    if done != 0:
        # Return to the line start so the bar is redrawn in place.
        print("\r", end="")
    print(f"[{'=' * filled + ' ' * (length - filled)}] {percent}%", end="")
    if finished:
        print()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
В данном случае эта строка не нужна.