Skip to content

Shcherbich Yana #33

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# RSS reader

RSS reader is a command-line utility which receives RSS URL and prints results in human-readable format.

## Specification
<pre>
usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT]
source

Pure Python command-line RSS reader.

positional arguments:
source RSS URL

optional arguments:
-h, --help show this help message and exit
--version Print version info
--json Print result as JSON in stdout
--verbose Outputs verbose status messages
--limit LIMIT Limit news topics if this parameter provided
--date DATE Take a date in %Y%m%d format. The news from the specified
day will be printed out.
--to-html TO_HTML Convert news into html and print in stdout. Argument
receives the path where new file will be saved.
--to-fb2 TO_FB2 Convert news into fb2 and print in stdout. Argument
receives the path where new file will be saved.

</pre>

## News caching
The RSS news are stored in a local storage while reading. Local storage is implemented using shelve. The cached news are dicts with the news itself and its raw (HTML) description, which are stored by key. The key consists of the date and the RSS URL. The cached news can be read with the optional argument --date. The utility creates a binary db file 'cache.db' in the current directory. If you change the current directory, the db file from the previous directory will not be copied to the new one.

## JSON structure
<pre>
{
"news": {
"feed": "TUT.BY: Новости ТУТ - Главные новости",
"items": [
{
"title": "Охрана, неприкосновенность, пенсия. Канопацкая предлагает закон о гарантиях для экс-президента Беларуси",
"link": "https://news.tut.by/economics/662957.html?utm_campaign=news-feed&utm_medium=rss&utm_source=rss-news",
"date": "Wed, 27 Nov 2019 15:41:00 +0300",
"description": {
"text": "Депутат Анна Канопацкая разработала законопроект «О гарантиях президенту Республики Беларусь, прекратившему исполнение своих полномочий, и членам его семьи» и в ближайшее время внесет его на рассмотрение в Палату представителей.",
"images": [
{
"src": "https://img.tyt.by/thumbnails/n/politika/04/4/c5109116a72e8f8029fecf5ca544c9d4.jpg",
"alt": "Фото: sb.by"
}
],
"links": null
}
}
]
}
}
</pre>
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
feedparser==2.2.1
bs4==0.0.1
dateparser==0.7.2
requests==2.22.0
lxml==4.4.2
Empty file added rss/__init__.py
Empty file.
156 changes: 156 additions & 0 deletions rss/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
"""This module provides work with cashed news."""

import logging
import shelve
import datetime
import sys
import json

import dateparser

from rss.converter_to_fb2 import Fb2Converter
from rss.converter_to_html import HTMLConverter


class Cache:
    """Store news in a local ``shelve`` database and replay them on demand.

    Records are keyed by ``<YYYYMMDD><feed URL>``. Every record is a dict with
    two parallel lists: ``list_of_news`` (news dicts) and
    ``list_of_row_descriptions`` (the description passed in with each news
    item; it is later handed to the HTML converter).
    """

    def __init__(self):
        logging.info("Cache initialization")
        # Relative name: the database is looked up in the working directory.
        self.db_file_name = 'cache.db'

    def _create_key(self, date: str, url: str) -> str:
        """Return the db key: the date immediately followed by the URL."""

        logging.info('Create key')
        return date + url

    def _convert_date(self, date: str) -> str:
        """Convert a date string to the ``%Y%m%d`` format.

        ``dateparser`` is tried first; when it cannot parse the string, a
        strict ``"%a, %d %b %Y %X %z"`` parse is attempted as a fallback.

        Raises:
            Exception: if neither parser understands ``date``. The original
                parsing error is attached as ``__cause__``.
        """

        logging.info('Convert date')

        converted_date = dateparser.parse(date)
        if not converted_date:
            logging.info("Date isn't clear. Try to parse again")
            try:
                converted_date = datetime.datetime.strptime(
                    date, "%a, %d %b %Y %X %z")
            except (TypeError, ValueError) as err:
                # Keep the historical exception type and message, but chain
                # the cause so the real parsing problem is not lost.
                raise Exception('Something wrong with date') from err
        return converted_date.strftime('%Y%m%d')

    def insert_news(self, news, row_description, url: str):
        """Insert news into the cache file.

        The cache file is created if it doesn't exist. A news item that is
        already stored under the same key is not stored twice.

        Args:
            news: news dict; its ``'date'`` entry is used to build the key.
            row_description: description stored alongside ``news``.
            url: feed URL, second part of the key.
        """

        key = self._create_key(self._convert_date(news['date']), url)
        logging.info("Open db or create if it doesn't exist for inserting news")
        with shelve.open(self.db_file_name) as db:
            record = db.get(key)
            if record:
                logging.info("Update record")
                if news not in record['list_of_news']:
                    record['list_of_news'].append(news)
                    record['list_of_row_descriptions'].append(row_description)
                    # shelve does not track in-place mutation: write it back.
                    db[key] = record
            else:
                logging.info("Create new record")
                db[key] = {
                    'list_of_news': [news],
                    'list_of_row_descriptions': [row_description],
                }

    def _check_entered_date(self, key: str):
        """Validate that the entered date is exactly 8 digits.

        Raises:
            ValueError: if the string has another length or non-digit chars.
        """

        logging.info('Check entered date')
        if len(key) != 8 or not key.isdigit():
            raise ValueError('Invalid entered date')

    def _get_news(self, key: str) -> dict:
        """Return the record stored under ``key``.

        Raises:
            Exception: if there is no record for ``key``; the ``KeyError`` is
                attached as ``__cause__``.
        """

        logging.info("Open db or create if it doesn't exist for getting news")
        with shelve.open(self.db_file_name) as db:
            try:
                return db[key]
            except KeyError as err:
                raise Exception("Can't find the news") from err

    def set_printing_news(self, url: str, date: str,
                          limit: int, json_mode: bool,
                          fb2_path: str, html_path: str):
        """Print cached news for ``date``/``url`` and optionally export them.

        Args:
            url: feed URL the news were cached for.
            date: day to read, 8 digits (``%Y%m%d``).
            limit: maximum number of news to output; ``None`` means all.
            json_mode: print JSON instead of the plain-text layout.
            fb2_path: when truthy, also save the news as fb2 to this path.
            html_path: when truthy, also save the news as HTML to this path.

        Raises:
            ValueError: on a malformed date or a non-positive limit.
            Exception: when nothing is cached for the date and URL.
        """

        logging.info("Set print format")

        self._check_entered_date(date)
        self._check_limit(limit)

        record = self._get_news(self._create_key(date, url))
        limited_news = record['list_of_news'][:limit]

        if json_mode:
            print(json.dumps(limited_news, indent=4, ensure_ascii=False))
        else:
            self.print_news(record['list_of_news'], limit)

        if fb2_path:
            conv = Fb2Converter(fb2_path)
            conv.convert_to_fb2(limited_news)
            conv.save_fb2()
        if html_path:
            conv = HTMLConverter(html_path)
            conv.save_html(conv.convert_to_html(
                limited_news, record['list_of_row_descriptions'][:limit]))

    def _check_limit(self, limit):
        """Check that the limit, when given, is greater than 0.

        Raises:
            ValueError: if ``limit`` is not ``None`` and ``limit <= 0``.
        """

        logging.info('Check limit')
        if limit is not None and limit <= 0:
            raise ValueError('Invalid limit: limit <= 0')

    def print_news(self, list_of_news, limit):
        """Print news.

        Args:
            list_of_news: a list of news dicts, or a single news dict.
            limit: maximum number of list items to print; ``None`` means all.
                Ignored when a single dict is passed.
        """

        logging.info('Start printing cached news')
        # A single news item may be passed as a bare dict (or dict subclass).
        if isinstance(list_of_news, dict):
            print('№', 1)
            self._print_entries(list_of_news)
            return

        for news_number, news in enumerate(list_of_news[:limit], start=1):
            print('№', news_number)
            self._print_entries(news)

    def _print_entries(self, news: dict):
        """Print one news item: title, date, link, text, images and links."""

        logging.info('Print one news')
        print('Title:', news['title'])
        print('Date:', news['date'])
        print('Link:', news['link'], '\n')

        description = news['description']

        # 'Nothing' is the placeholder stored when an item has no text.
        if description['text'] != 'Nothing':
            print(description['text'], '\n')

        if description['images']:
            print('Images:')
            for item in description['images']:
                print(item['src'])

        if description['links']:
            print('Links:')
            for item in description['links']:
                print(item)

        print('-' * 50)
124 changes: 124 additions & 0 deletions rss/converter_to_fb2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""This module converts news to fb2 format and saves."""

import os
import logging
from base64 import b64encode
import xml.etree.ElementTree as tree
from xml.etree.ElementTree import Element
import xml.dom.minidom as minidom

import requests


class Fb2Converter:
    """Build a FictionBook (fb2) XML document from news and save it to a file.

    The document tree is created in ``__init__``; ``convert_to_fb2`` fills it
    and ``save_fb2`` writes it to ``self.path``.
    """

    # Seconds to wait for an image download before giving up.
    REQUEST_TIMEOUT = 10
    # Content type used when it cannot be derived from the image URL.
    DEFAULT_IMAGE_TYPE = 'image/png'

    def __init__(self, path='rss-news.fb2'):
        logging.info('Fb2Converter initialization')
        self.path = path
        self.root = tree.Element('FictionBook')
        self.root.set('xmlns:l', "http://www.w3.org/1999/xlink")
        self.description = tree.SubElement(self.root, 'description')
        self.body = tree.SubElement(self.root, 'body')
        # Ids already given to <binary> nodes; they must be unique because
        # <image l:href="#id"> refers to them.
        self._image_ids = set()

    def insert_file_description(self):
        """Insert the file description (book title)."""

        logging.info('Insert description')
        title_info = tree.SubElement(self.description, 'title-info')
        tree.SubElement(title_info, 'book-title').text = 'RSS news'

    def insert_body(self, list_of_news, limit):
        """Insert one section per news item, at most ``limit`` of them.

        ``limit=None`` inserts every item.
        """

        logging.info("Insert body")
        for news in list_of_news[:limit]:
            self.insert_section(news)

    def insert_section(self, news):
        """Insert a section describing a single news item.

        Images are best effort: an image that cannot be added is reported and
        skipped, the remaining images and the rest of the section are kept.
        """

        logging.info('Insert describing single news section')
        section = tree.SubElement(self.body, 'section')
        description = news['description']

        self.insert_tag_p(section, news['title'], True)
        self.insert_tag_empty_line(section)
        self.insert_tag_p(section, 'Link: ' + news['link'])
        self.insert_tag_p(section, 'Date: ' + news['date'])
        self.insert_tag_empty_line(section)

        for img in description['images'] or ():
            # Guard each image separately so one broken image does not drop
            # all the following ones.
            try:
                self.insert_image(section, img['src'], img['alt'])
            except Exception as e:
                logging.warning('Image was skipped: %s', e)
                print("Errors with images: ", e)

        self.insert_tag_empty_line(section)
        self.insert_tag_p(section, description['text'])

        if description['links']:
            self.insert_tag_empty_line(section)
            self.insert_tag_p(section, 'Links:')
            for link in description['links']:
                self.insert_tag_p(section, link)

        self.insert_tag_empty_line(section)
        self.insert_tag_p(section, '-'*50)

    def insert_tag_empty_line(self, parent):
        """Insert an <empty-line/> tag into ``parent``."""

        logging.info('Insert empty line')
        tree.SubElement(parent, 'empty-line')

    def insert_tag_p(self, parent, text, strong_mode=None):
        """
        Insert tag p with text.
        If strong_mode then text will be bold.
        """

        if strong_mode:
            logging.info('Insert tag p with ')
            tag_p = tree.SubElement(parent, 'p')
            tree.SubElement(tag_p, 'strong').text = text
        else:
            logging.info('Insert tag p')
            tree.SubElement(parent, 'p').text = text

    def convert_to_fb2(self, news, limit=None):
        """Fill the document tree with the description and the news.

        Nothing is returned; the result is kept in ``self.root`` and written
        out by ``save_fb2``.
        """

        logging.info('Start conversion to fb2')
        self.insert_file_description()
        self.insert_body(news, limit)

    def save_fb2(self):
        """Save the fb2 document to ``self.path`` as pretty-printed UTF-8."""

        logging.info('Save fb2 converted news')
        # Pretty-print in memory and write bytes with an explicit encoding,
        # so the result does not depend on the locale's default encoding.
        raw_xml = tree.tostring(self.root, encoding='utf-8')
        pretty_xml = minidom.parseString(raw_xml).toprettyxml(encoding='utf-8')

        with open(self.path, 'wb') as file:
            file.write(pretty_xml)

    def insert_image(self, parent, img_url, img_name):
        """Insert image tag in format: <image l:href="#{img_name}"/>.

        The image data is stored in a <binary> node under the root. When
        ``img_name`` is empty or already used, a unique id derived from it is
        used instead, so every <image> points at its own <binary>.
        """

        logging.info('Insert image')
        # Download first: if it fails, nothing has been added to the tree.
        encoded_image = self.get_binary_img(img_url)
        image_id = self._unique_image_id(img_name)

        image = tree.SubElement(parent, 'image')
        image.set('l:href', '#' + image_id)
        binary = tree.SubElement(self.root, 'binary')
        binary.set('id', image_id)
        binary.set('content-type', self._guess_content_type(img_url))
        binary.text = encoded_image

    def _unique_image_id(self, img_name):
        """Return ``img_name``, or a suffixed variant if it is already used."""

        base = img_name or 'image'
        candidate = base
        suffix = 1
        while candidate in self._image_ids:
            suffix += 1
            candidate = '{}_{}'.format(base, suffix)
        self._image_ids.add(candidate)
        return candidate

    def _guess_content_type(self, img_url):
        """Guess the image MIME type from the URL path, default to PNG."""

        import mimetypes
        from urllib.parse import urlsplit

        # Use only the path so a query string does not hide the extension.
        guessed, _ = mimetypes.guess_type(urlsplit(img_url).path)
        if guessed and guessed.startswith('image/'):
            return guessed
        return self.DEFAULT_IMAGE_TYPE

    def get_binary_img(self, src):
        """Return img as base64 in string form.

        Raises an exception from ``requests`` when the download fails, times
        out or the server answers with an HTTP error status.
        """

        logging.info('Get binary img')
        resource = requests.get(src, timeout=self.REQUEST_TIMEOUT)
        # Do not embed an HTML error page as if it were image data.
        resource.raise_for_status()
        return b64encode(resource.content).decode('UTF-8')
52 changes: 52 additions & 0 deletions rss/converter_to_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""This module converts news to HTML and fb2 and saves."""

import os
import logging

from bs4 import BeautifulSoup
from lxml import html
from lxml import etree
from lxml.builder import E


class HTMLConverter:
    """Convert news to an HTML page and save the page to a file."""

    def __init__(self, path='rss-news.html'):
        logging.info('HTMLConverter initialization')
        self.path = path

    def convert_to_html(self, list_of_news, list_of_row_descriptions):
        """Return news converted into HTML.

        Args:
            list_of_news: news dicts with ``'title'``, ``'link'``, ``'date'``.
            list_of_row_descriptions: HTML description for each news item, in
                the same order; the two lists are walked in lockstep.

        Returns:
            The root ``<html>`` element (lxml) of the generated page.
        """

        logging.info('Start conversion to HTML')
        body = E.body()
        page = E.html(
            # Declare the charset: save_html writes the file as UTF-8.
            E.head(E.meta(charset='utf-8'), E.title("RSS news")),
            body,
        )

        for single_news, single_description in \
                zip(list_of_news, list_of_row_descriptions):
            logging.info('Convert one news')
            body.append(E.P(
                E.center(E.h2(single_news['title'])),
                E.h2(E.a(single_news['link'], href=single_news['link'])),
                E.h4(single_news['date']),
            ))
            # html.fromstring() raises on an empty document, so an item
            # without a description contributes only its header.
            if single_description and single_description.strip():
                body.append(html.fromstring(single_description))
            body.append(E.BR())
            body.append(E.BR())
            body.append(E.HR())
        return page

    def save_html(self, html_news):
        """Save HTML converted news on the received path.

        The file is written as UTF-8 regardless of the locale's default
        encoding.
        """

        logging.info('Save HTML converted news')
        with open(self.path, 'w', encoding='utf-8') as file:
            file.write(html.tostring(html_news,
                                     pretty_print=True,
                                     encoding='unicode',
                                     method='html',
                                     doctype='<!DOCTYPE html>'))
Loading