-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxmlparser.py
70 lines (54 loc) · 1.74 KB
/
xmlparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import requests
import xml.etree.ElementTree as ET
import urllib.parse
from getDetail import getDetail
from bs4 import BeautifulSoup
from datetime import datetime
# XML parser
def requestXML(url):
    """Download *url* and parse the response body as XML.

    Args:
        url: Address of the XML document to fetch.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the parsed document.

    Raises:
        requests.RequestException: If the HTTP request itself fails
            (connection error, or the 5-second timeout is exceeded).
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    response = requests.get(url, timeout=5)
    # Hand the parser the raw bytes instead of ``response.text``: ``.text``
    # is decoded with an encoding guessed from the HTTP headers, which can
    # garble non-ASCII characters before the XML parser ever sees them.
    # With bytes, the parser follows the encoding named in the XML
    # declaration (UTF-8 when none is given).
    # NOTE(review): the HTTP status code is not checked here; an error page
    # only surfaces as a ParseError if it is not well-formed XML.
    return ET.fromstring(response.content)
def getEntityGoogle(xmlRoot):
    """Return the direct children of *xmlRoot* from the fifth one onward.

    The first four children are skipped; presumably they are feed-level
    metadata that precede the actual entries (confirm against the feed).

    Args:
        xmlRoot: Root element of the parsed XML document.

    Returns:
        A list of the remaining child elements (empty when the root has
        four children or fewer).
    """
    return list(xmlRoot)[4:]
# example
def removeHTMLTags(text):
    """Strip bold tags and a few escaped HTML entities from *text*.

    The substrings ``<b>``, ``</b>``, ``&quot;``, ``&amp;`` and ``&#39;``
    are deleted (not decoded), in that order. Everything else, including
    bare quote and ampersand characters, is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with every listed substring removed.
    """
    # The entities must be spelled in their escaped form: writing the
    # decoded characters here would strip ordinary punctuation instead
    # (and a bare triple quote is not even a valid string literal).
    removeElements = ("<b>", "</b>", "&quot;", "&amp;", "&#39;")
    result = text
    for removeElement in removeElements:
        result = result.replace(removeElement, "")
    return result
def _extractOriginURL(url):
    """Return the decoded target address embedded in a redirect *url*."""
    start = url.find("url=")
    if start == -1:
        # No embedded target to unwrap; fall back to the link as given
        # instead of slicing from a meaningless offset.
        return url
    start += len("url=")
    # Look for the terminator only after the marker; when it is absent take
    # the rest of the string (find() returning -1 would otherwise silently
    # chop off the last character).
    end = url.find("&ct=ga", start)
    if end == -1:
        end = len(url)
    return urllib.parse.unquote(url[start:end])


def getTitleAndLinkGoogle(xmlRoot):
    """Build one record per entry element found under *xmlRoot*.

    For every element returned by ``getEntityGoogle`` the title, publication
    date and link are read by child position, printed to stdout, and the
    detail for the link is obtained through ``getDetail``.

    Args:
        xmlRoot: Root element of the parsed feed.

    Returns:
        A list of dicts with the keys ``"title"`` (cleaned title text),
        ``"date"`` (naive ``datetime`` parsed from the
        ``%Y-%m-%dT%H:%M:%SZ`` timestamp), ``"link"`` (decoded target URL)
        and ``"detail"`` (whatever ``getDetail`` returns for that link).

    Raises:
        ValueError: If a timestamp does not match the expected format.
        IndexError: If an entry has fewer children than expected.
        KeyError: If the link element has no ``href`` attribute.
    """
    result = []
    for child in getEntityGoogle(xmlRoot):
        # Fields are addressed by position: child[1] is read as the title,
        # child[2] as the link and child[3] as the publication timestamp.
        # NOTE(review): this relies on the feed's element order -- confirm.
        title = removeHTMLTags(child[1].text)
        print("title:\t", title)

        published = datetime.strptime(child[3].text, "%Y-%m-%dT%H:%M:%SZ")
        print("date:\t", published)

        originurl = _extractOriginURL(child[2].attrib["href"])
        print("link:\t", originurl)

        result.append(
            {
                "title": title,
                "date": published,
                "link": originurl,
                "detail": getDetail(originurl),
            }
        )
    return result
def getContent(url):
    """Fetch the XML feed at *url* and return its parsed entry records.

    Args:
        url: Address of the feed to download.

    Returns:
        The list produced by ``getTitleAndLinkGoogle`` for the feed's root.
    """
    return getTitleAndLinkGoogle(requestXML(url))