-
Notifications
You must be signed in to change notification settings - Fork 0
/
regex.py
29 lines (24 loc) · 820 Bytes
/
regex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from bs4 import BeautifulSoup
import lxml
import re
# Alternative patterns, currently disabled. They are written as raw strings so
# that re-enabling one does not introduce invalid string escapes such as "\s".
# pattern = re.compile(r'(.*)\s&\s(.*)')
# pattern = re.compile(r'\((.*)\s&\s(.*)\)')
# pattern = re.compile(r'-\s(.*)')
# pattern = re.compile(r'\|\s(.*)')
# pattern = re.compile(r'(.*)\s\|\s(.*)')
# pattern = re.compile(r'\d+\s(.*)\s(“|\")')
# pattern = re.compile(r'(.*)\son\s')
# pattern = re.compile(r'\d+\s-\s(.*)\son\s')
# pattern = re.compile(r'\d+\s-\s(.*)\s\(')

# Active pattern: group 1 captures the text that precedes a whitespace
# character followed by "|". The leading "(.*)" is greedy, so when several
# such separators occur on one line the capture runs up to the last of them.
# A raw string is used because "\s" and "\|" are not valid escapes in a
# normal string literal and trigger a warning at compile time.
pattern = re.compile(r'(.*)\s\|.*')
def run(path="toscrape.html"):
    """Print the group-1 capture of ``pattern`` for every matching link text.

    The HTML file at ``path`` is parsed with BeautifulSoup using the "lxml"
    parser. For each ``<a>`` tag, its ``.string`` is searched with the
    module-level ``pattern``; on a match, group 1 is printed followed by
    ", " (no newline is written).

    Args:
        path: Location of the HTML file to read, opened as UTF-8. Defaults
            to "toscrape.html", the file the script originally hard-coded.
    """
    with open(path, encoding='utf8') as fp:
        soup = BeautifulSoup(fp, "lxml")
        for tag in soup.find_all('a'):
            name = tag.string
            # Skip links whose .string is None or empty; there is no text
            # to search in that case.
            if not name:
                continue
            m = pattern.search(name)
            if m:
                print(m.group(1), end=", ")
# Top-level call: the extraction runs as soon as this file is executed or
# imported.
run()