Skip to content

Pashkevich Anton #32

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions final_task/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Copyright (c) 2019 The Python Packaging Authority

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
1 change: 1 addition & 0 deletions final_task/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include requirements.txt
44 changes: 44 additions & 0 deletions final_task/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
##### JSON structure

```
{
"news": {
"feed": "Yahoo News - Latest News & Headlines",
"publications": [
{
"title": "Stefanik embraces spotlight at impeachment hearings",
"pub_date": "Fri, 15 Nov 2019 17:55:51 -0500",
"link": "https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html",
"description": "[image 2: Stefanik embraces spotlight at impeachment hearings] [2]\nThe second day of the impeachment inquiry\u2019s public hearings, on Friday, began the same way\nas the first: with an attempt by Rep. Elise Stefanik, a New York Republican, to interrupt proceedings\nwith a procedural objection.",
"hrefs": [
[
"https://news.yahoo.com/stefanik-embraces-spotlight-at-impeachment-hearings-225551297.html",
"link"
],
[
"http://l.yimg.com/uu/api/res/1.2/NRuDo56c6EiwjZH4WOqEZg--/YXBwaWQ9eXRhY2h5b247aD04Njt3PTEzMDs-/https://media-mbst-pub-ue1.s3.amazonaws.com/creatr-uploaded-images/2019-11/7a1d0760-07d6-11ea-bef7-f17150574bb2",
"image",
"Stefanik embraces spotlight at impeachment hearings"
]
]
}
]
}
}
```

##### Cache description

News received from a feed is cached in a database that is created locally.

The database consists of a single file named "cache.db". It has the following structure:

| | id | feed | title | pub_date | pub_parsed | link | description | hrefs |
|-----|------|------|-------|----------|------------|------|-------------|-------|
|post | .. | ... | ... | ... | ... | ... | ... | ... |

All fields except "id" have the text type. The "id" field serves as the post's primary key.

The "hrefs" field is composed of all post links, including image links and image descriptions.
The section of regular references and the section of image links are separated by the --|-- sequence.
Items within one section are separated by the -+- sequence, and -|- separates a link, its type and the image description.
4 changes: 4 additions & 0 deletions final_task/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
feedparser
bs4
fpdf
requests
Empty file.
93 changes: 93 additions & 0 deletions final_task/rss_reader/cacher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""
this module provides tools for caching news

it includes functions for work with database and support ones
"""

import sqlite3
from re import match

def init_database(db_path='cache.db'):
    """
    this function creates and initializes the database for caching news

    db_path is the location of the SQLite database file; it defaults to
    'cache.db' (relative to the current working directory). Any value accepted
    by sqlite3.connect works, e.g. ':memory:' for a throwaway database.

    returns a (connection, cursor) tuple bound to the opened database
    """
    connection_obj = sqlite3.connect(db_path)
    cursor_obj = connection_obj.cursor()
    # IF NOT EXISTS makes the call safe to repeat on an existing cache file
    cursor_obj.execute(
        '''CREATE TABLE IF NOT EXISTS cache (id integer primary key, feed text, title text, pub_date text, pub_parsed text, link text, description text, hrefs text)'''
    )
    connection_obj.commit()

    return connection_obj, cursor_obj

def cache_news(connection_obj, cursor_obj, news):
    """
    this function adds parsed news to the database

    news is an iterable of post dicts with the keys 'feed', 'title',
    'pub_date', 'pub_parsed', 'link', 'description' and 'hrefs'.
    A post is inserted only when no row with exactly the same values is
    already stored, so caching the same feed twice creates no duplicates.
    """
    for post in news:
        # build the row once: it is used both for the duplicate check and
        # for the insert, and serializing hrefs twice would be wasted work
        row = (
            post['feed'],
            post['title'],
            post['pub_date'],
            post['pub_parsed'],
            post['link'],
            post['description'],
            hrefs_to_text(post['hrefs']),
        )
        cursor_obj.execute(
            '''SELECT id FROM cache WHERE feed=? AND title=? AND pub_date=? AND pub_parsed=? AND link=? AND description=? AND hrefs=?''',
            row
        )
        if cursor_obj.fetchone() is None:
            cursor_obj.execute(
                '''INSERT INTO cache (feed, title, pub_date, pub_parsed, link, description, hrefs) VALUES (?, ?, ?, ?, ?, ?, ?)''',
                row
            )

    # a single commit for the whole batch instead of one per inserted post
    connection_obj.commit()

def get_cached_news(cursor_obj, date):
    """
    this function fetches news from the database and returns them as a list

    date is compared for equality with the 'pub_parsed' column.
    Every returned item is a dict with the keys 'feed', 'title', 'pub_date',
    'pub_parsed', 'link', 'description' and 'hrefs'; 'hrefs' is a list of
    tuples restored from the text form produced by hrefs_to_text.
    """
    # columns are named explicitly so the unpacking below does not depend on
    # the physical column order of the table
    cursor_obj.execute(
        '''SELECT feed, title, pub_date, pub_parsed, link, description, hrefs FROM cache WHERE pub_parsed=?''',
        (date, )
    )

    news = []
    for feed, title, pub_date, pub_parsed, link, description, hrefs_text in cursor_obj.fetchall():
        # text layout: "<links section>--|--<images section>"; the second
        # section is absent when the post has no images, and a NULL column
        # is treated as "no links at all"
        sections = (hrefs_text or '').split("--|--")[:2]
        hrefs = [
            tuple(item.split("-|-"))
            for section in sections
            for item in section.split("-+-")
            if item != ''
        ]
        news.append({
            'feed': feed,
            'title': title,
            'pub_date': pub_date,
            'pub_parsed': pub_parsed,
            'link': link,
            'description': description,
            'hrefs': hrefs,
        })

    return news

def hrefs_to_text(link_list):
    """
    this function converts the list of links connected to a post to text form

    link_list holds (url, type) items for regular links and
    (url, 'image', description) items for images.

    The result is "<links>--|--<images>": every regular link is written as
    "-+-url-|-type", every image as "url-|-image-|-description-+-". The
    "--|--" separator and the images section are omitted when there are no
    images, and an empty list gives an empty string.

    Items are grouped by type rather than by position, so a regular link that
    follows an image is still written to the links section.
    """
    links = [tpl for tpl in link_list if tpl[1] != 'image']
    images = [tpl for tpl in link_list if tpl[1] == 'image']

    res_line = ''.join(f"-+-{tpl[0]}-|-{tpl[1]}" for tpl in links)
    if images:
        res_line += '--|--'
        res_line += ''.join(f"{tpl[0]}-|-{tpl[1]}-|-{tpl[2]}-+-" for tpl in images)

    return res_line

def is_valid_date(line):
    """
    this function checks that a date parameter is a real date in YYYYMMDD form

    returns the regex match object (truthy) when line consists of exactly
    eight digits that denote an existing calendar date, otherwise None
    """
    from datetime import datetime

    date = r"^[1-2][0-9]{3}[0-1][0-9][0-3][0-9]$"
    found = match(date, line)
    if found is None:
        return None

    # the pattern only constrains digit ranges, so values such as month 13,
    # day 00 or February 30 still pass it; strptime rejects those
    try:
        datetime.strptime(line, "%Y%m%d")
    except ValueError:
        return None

    return found
169 changes: 169 additions & 0 deletions final_task/rss_reader/format_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
"""
this module provides tools for converting news to html and pdf formats
"""

import os
import shutil
import requests
from fpdf import FPDF

def break_lines(text):
    """
    this function replaces every '\\n' in text with a <br> tag

    returns the new string; text without line breaks (including an empty
    string) is returned unchanged
    """
    # str.replace does the whole job in one pass, without manual index
    # arithmetic or relying on IndexError to detect the end of the string
    return text.replace('\n', '<br>')

def to_html(news, filepath):
    """
    this function prints news in html format to file

    news is an iterable of post dicts with the keys 'feed', 'title',
    'pub_date', 'link', 'description' and 'hrefs'; 'hrefs' items are
    (url, type) for regular links and (url, 'image', description) for images.
    filepath is the destination file; it is overwritten and written as UTF-8.

    All values taken from the feed are HTML-escaped before being put into the
    page, because feed content is untrusted and may contain markup.
    """
    from html import escape

    def esc(value):
        # quote=True also escapes quotes, so the value is safe in attributes
        return escape(str(value), quote=True)

    with open(filepath, "w", encoding='utf-8') as f:
        # the charset is declared explicitly because the file is written as
        # UTF-8; without it browsers may guess a different encoding
        f.write('''<!DOCTYPE html>
<html lang="en" dir="ltr">
<head>
<meta charset="utf-8">
<title>rss_reader</title>
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
<style>
ul>li{
list-style: none;
border: 1px solid;
margin-top: 20px;
padding: 10px;
}
ul>li>p:nth-child(1){
font-size: 35px;
}

ul>li{
border-radius: 10px;
box-shadow: 1px 1px 10px black;
}
</style>
</head>
<body>
<div class="container">
<h1 style="text-align: center">Actual News</h1>
<ul>''')
        for post in news:
            link = esc(post['link'])
            # escape first, then insert the <br> tags, so the tags themselves
            # are not escaped
            description = break_lines(esc(post['description']))
            f.write(f'''
<li>
<p>Feed: {esc(post['feed'])}</p>
<p>Title: {esc(post['title'])}</p>
<p>Publication date: {esc(post['pub_date'])}</p>
<p>Link: <a href = "{link}">{link}</a></p>
<p>{description}</p>
<p>Links:</p>
<ol>''')
            for tpl in post['hrefs']:
                url = esc(tpl[0])
                if tpl[1] != 'image':
                    f.write(f'''
<li>
<p><a href = "{url}">{url}</a></p>
</li>''')
                else:
                    f.write(f'''
<li>
<p>{esc(tpl[2])}<br><a href = "{url}"><img src = "{url}"></a></p>
</li>''')
            f.write('''
</ol>
</li>''')
        f.write('''
</ul>
</div>
<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
</body>
</html>''')

class user_FPDF(FPDF):
    """
    FPDF subclass that numbers pages: its footer prints the current page number
    """
    def footer(self):
        # negative value: presumably an offset from the bottom of the page
        # (FPDF set_y convention) -- confirm against the fpdf documentation
        self.set_y(-15)
        page_label = str(self.page_no())
        # right-aligned cell; width 0 is passed through to FPDF unchanged
        self.cell(0, 10, txt=page_label, align='R')

def download_image(url, dest_filepath, timeout=10):
    """
    this function downloads an image from url and saves it in file

    url is the address to fetch, dest_filepath is the file to write (it is
    overwritten). timeout is the number of seconds passed to requests as the
    request timeout, so an unresponsive server cannot block the program
    forever.

    exceptions raised by requests (connection errors, timeouts) are not
    handled here and propagate to the caller
    """
    # the response is used as a context manager so the streamed connection is
    # always released; the file is opened only after the request succeeded,
    # so a failed request does not leave an empty file behind
    with requests.get(url, stream=True, timeout=timeout) as response:
        with open(dest_filepath, 'wb') as f:
            for block in response.iter_content(1024):
                if not block:
                    break
                f.write(block)

def to_pdf(news, filepath):
    """
    this function prints news in pdf format to file

    news is an iterable of post dicts with the keys 'feed', 'title',
    'pub_date', 'link', 'description' and 'hrefs'; filepath is the
    destination pdf file. Every post starts on a new page.

    The font and the images of the posts are downloaded into a private
    temporary directory that is removed afterwards, even when the conversion
    fails. An image that cannot be downloaded or embedded is skipped.
    """
    import tempfile

    font_url = "https://raw.github.com/prague15031939/font_storage/master/DejaVuSansCondensed.ttf"

    # a dedicated temporary directory instead of "tmp_files" in the current
    # directory: nothing of the user's can be deleted by the cleanup, and the
    # context manager removes the directory on errors too
    with tempfile.TemporaryDirectory(prefix="rss_reader_") as work_dir:
        pdf_obj = user_FPDF()
        font_path = os.path.join(work_dir, 'DejaVuSansCondensed.ttf')
        with open(font_path, "wb") as f:
            f.write(requests.get(font_url, timeout=30).content)
        pdf_obj.add_font('DejaVu', '', font_path, uni=True)

        image_id = 0
        for ind, post in enumerate(news):
            pdf_obj.add_page()
            if ind == 0:
                # document heading, printed once on the first page
                pdf_obj.set_font('Arial', style='B', size=16)
                pdf_obj.cell(200, 15, txt='ACTUAL NEWS', align='C', ln=1)
            _write_post_header(pdf_obj, post)
            image_id = _write_post_links(pdf_obj, post['hrefs'], work_dir, image_id)

        # the output is produced while the temporary files still exist
        pdf_obj.output(filepath)


def _write_post_header(pdf_obj, post):
    """
    this function prints feed, title, date, link and description of one post
    """
    pdf_obj.set_font('DejaVu', '', 12)
    pdf_obj.cell(5, 5, txt="#")
    pdf_obj.cell(180, 5, txt=f"Feed: {(post['feed'])}", ln=1)
    pdf_obj.cell(200, 5, ln=1)
    pdf_obj.cell(5, 5)
    pdf_obj.multi_cell(180, 5, txt=f"Title: {(post['title'])}")
    pdf_obj.cell(5, 5)
    pdf_obj.cell(200, 5, txt=f"Publication date: {post['pub_date']}", ln=1)
    pdf_obj.cell(5, 5)
    pdf_obj.cell(10, 5, txt='Link: ')
    pdf_obj.set_font('Arial', style='I', size=12)
    pdf_obj.multi_cell(180, 5, txt=f"{post['link']}")
    pdf_obj.set_font('DejaVu', '', 12)
    pdf_obj.cell(200, 5, ln=1)
    pdf_obj.cell(5, 5)
    pdf_obj.multi_cell(200, 5, txt=f"{post['description']}")
    pdf_obj.cell(200, 5, ln=1)
    pdf_obj.cell(5, 5)
    pdf_obj.cell(200, 5, txt="Links:", ln=1)


def _write_post_links(pdf_obj, hrefs, work_dir, image_id):
    """
    this function prints the numbered links of one post and embeds its images

    image_id is the number used for the next downloaded image file in
    work_dir; the updated value is returned so file names stay unique
    across posts
    """
    for index, tpl in enumerate(hrefs):
        pdf_obj.cell(10, 5)
        if tpl[1] != 'image':
            pdf_obj.set_font('DejaVu', '', 12)
            pdf_obj.cell(7, 5, txt=f"[{index + 1}] ")
            pdf_obj.set_font('Arial', style='I', size=12)
            pdf_obj.multi_cell(170, 5, txt=f"{tpl[0]}")
        else:
            pdf_obj.set_font('DejaVu', '', 12)
            pdf_obj.multi_cell(170, 5, txt=f"[{index + 1}] {tpl[2]}")
            try:
                img_path = os.path.join(work_dir, f"{image_id}.jpeg")
                download_image(tpl[0], img_path)
                pdf_obj.image(img_path, x=22, y=pdf_obj.get_y()+5, link=tpl[0])
                image_id += 1
            except (RuntimeError, requests.RequestException):
                # best effort: an image that cannot be fetched or embedded
                # is left out, the rest of the document is still produced
                pass

    return image_id
Loading