-
Notifications
You must be signed in to change notification settings - Fork 1
/
jav321.py
138 lines (100 loc) · 3.17 KB
/
jav321.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import json
from bs4 import BeautifulSoup
from lxml import html
from ADC_function import post_html
def main(number: str) -> str:
    """Search jav321.com for *number* and return its metadata as JSON text.

    The search form is posted with the number in the ``sn`` field. When the
    URL of the response contains ``/video/`` the page is scraped for
    metadata; otherwise an empty JSON object is produced.

    Args:
        number: Product number to search for (for example ``"wmc-002"``).

    Returns:
        A JSON document serialized with sorted keys, 4-space indentation and
        non-ASCII characters left unescaped. It is ``{}`` when the response
        URL does not point at a video page.
    """
    result = post_html(url="https://www.jav321.com/search", query={"sn": number})

    if "/video/" in result.url:
        # Parse only once we know there is a detail page to scrape; the
        # parsed trees are not needed (and the body may not be parseable)
        # when the search did not land on a video page.
        soup = BeautifulSoup(result.text, "html.parser")
        lx = html.fromstring(str(soup))

        data = parse_info(soup)
        dic = {
            "title": get_title(lx),
            "studio": "",
            "year": get_year(data),
            "outline": get_outline(lx),
            "director": "",
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "jav321.py",
            # Fields parsed from the info panel (actor, label, tag, ...).
            **data,
        }
    else:
        dic = {}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_title(lx: html.HtmlElement) -> str:
    """Return the title text of the detail page.

    Args:
        lx: Parsed lxml tree of the page.

    Returns:
        The first text node of the ``<h3>`` addressed by the absolute XPath
        below, stripped of surrounding whitespace, or an empty string when
        the path matches nothing.
    """
    matches = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")
    # Indexing an empty match list would raise IndexError; report a missing
    # title as "" like the other field getters do for missing data.
    return matches[0].strip() if matches else ""
def parse_info(soup: BeautifulSoup) -> dict:
    """Extract the labelled metadata fields from the page's info panel.

    Args:
        soup: Parsed BeautifulSoup document of the detail page.

    Returns:
        A dict with the keys ``actor``, ``label``, ``tag``, ``number``,
        ``release`` and ``runtime``, or an empty dict when the
        ``div.row > div.col-md-9`` element is not found.
    """
    info_block = soup.select_one("div.row > div.col-md-9")
    if not info_block:
        return {}

    # The panel's fields are separated by <br/>; index every raw fragment by
    # the text of its bold label so the getters can look fields up by name.
    fields = {
        get_bold_text(h=fragment): fragment
        for fragment in str(info_block).split("<br/>")
    }

    return {
        "actor": get_actor(fields),
        "label": get_label(fields),
        "tag": get_tag(fields),
        "number": get_number(fields),
        "release": get_release(fields),
        "runtime": get_runtime(fields),
    }
def get_bold_text(h: str) -> str:
    """Return the text of the first ``<b>`` element in an HTML fragment.

    Args:
        h: HTML fragment to inspect.

    Returns:
        The text of the first ``<b>`` tag, or ``"UNKNOWN_TAG"`` when the
        fragment contains no such tag.
    """
    bold = BeautifulSoup(h, "html.parser").b
    return bold.text if bold else "UNKNOWN_TAG"
def get_anchor_info(h: str) -> str:
    """Return the text of every link in an HTML fragment, comma-separated.

    Args:
        h: HTML fragment to inspect.

    Returns:
        The texts of all ``<a>`` elements that carry an ``href`` attribute,
        joined with ``","``; an empty string when there are none.
    """
    anchors = BeautifulSoup(h, "html.parser").find_all("a", href=True)
    return ",".join(anchor.text for anchor in anchors)
def get_text_info(h: str) -> str:
    """Return the value that follows the ``": "`` separator in a fragment.

    Args:
        h: Field fragment of the form ``<label>: <value>``.

    Returns:
        The segment between the first and the second ``": "`` separator (the
        whole remainder when there is only one separator), or an empty
        string when the fragment contains no separator at all.
    """
    parts = h.split(": ")
    # A fragment without the separator carries no value; return "" instead
    # of letting ``parts[1]`` raise IndexError.
    return parts[1] if len(parts) > 1 else ""
def get_cover(lx: html.HtmlElement) -> str:
    """Return the cover image URL of the detail page.

    Args:
        lx: Parsed lxml tree of the page.

    Returns:
        The ``src`` attribute of the image addressed by the absolute XPath
        below, or an empty string when the path matches nothing.
    """
    matches = lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")
    # Guard against an empty match list, which would raise IndexError.
    return matches[0] if matches else ""
def get_outline(lx: html.HtmlElement) -> str:
    """Return the outline (description) text of the detail page.

    Args:
        lx: Parsed lxml tree of the page.

    Returns:
        The first text node of the ``<div>`` addressed by the absolute XPath
        below, or an empty string when the path matches nothing.
    """
    matches = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")
    # Guard against an empty match list, which would raise IndexError.
    return matches[0] if matches else ""
def get_actor(data: dict) -> str:
    """Return the comma-separated link texts of the "女优" (actress) field.

    Args:
        data: Mapping of field label to the raw HTML fragment of that field,
            as built by ``parse_info``.

    Returns:
        The result of ``get_anchor_info`` for the "女优" fragment, or an
        empty string when the field is absent.
    """
    if "女优" not in data:
        return ""
    return get_anchor_info(data["女优"])
def get_label(data: dict) -> str:
    """Return the comma-separated link texts of the "片商" (studio) field.

    Args:
        data: Mapping of field label to the raw HTML fragment of that field,
            as built by ``parse_info``.

    Returns:
        The result of ``get_anchor_info`` for the "片商" fragment, or an
        empty string when the field is absent.
    """
    if "片商" not in data:
        return ""
    return get_anchor_info(data["片商"])
def get_tag(data: dict) -> str:
    """Return the comma-separated link texts of the "标签" (tags) field.

    Args:
        data: Mapping of field label to the raw HTML fragment of that field,
            as built by ``parse_info``.

    Returns:
        The result of ``get_anchor_info`` for the "标签" fragment, or an
        empty string when the field is absent.
    """
    if "标签" not in data:
        return ""
    return get_anchor_info(data["标签"])
def get_number(data: dict) -> str:
    """Return the text value of the "番号" (product number) field.

    Args:
        data: Mapping of field label to the raw HTML fragment of that field,
            as built by ``parse_info``.

    Returns:
        The result of ``get_text_info`` for the "番号" fragment, or an empty
        string when the field is absent.
    """
    if "番号" not in data:
        return ""
    return get_text_info(data["番号"])
def get_release(data: dict) -> str:
    """Return the text value of the "发行日期" (release date) field.

    Args:
        data: Mapping of field label to the raw HTML fragment of that field,
            as built by ``parse_info``.

    Returns:
        The result of ``get_text_info`` for the "发行日期" fragment, or an
        empty string when the field is absent.
    """
    if "发行日期" not in data:
        return ""
    return get_text_info(data["发行日期"])
def get_runtime(data: dict) -> str:
    """Return the text value of the "播放时长" (playback duration) field.

    Args:
        data: Mapping of field label to the raw HTML fragment of that field,
            as built by ``parse_info``.

    Returns:
        The result of ``get_text_info`` for the "播放时长" fragment, or an
        empty string when the field is absent.
    """
    if "播放时长" not in data:
        return ""
    return get_text_info(data["播放时长"])
def get_year(data: dict) -> str:
    """Return the first four characters of the release value as the year.

    Args:
        data: Parsed info dict; the ``"release"`` entry is read when present.

    Returns:
        The first four characters of ``data["release"]``, or an empty string
        when there is no ``"release"`` key.
    """
    if "release" not in data:
        return ""
    return data["release"][:4]
if __name__ == "__main__":
    # Manual smoke test: look up a sample number and print the JSON result.
    sample_output = main("wmc-002")
    print(sample_output)