@@ -1,30 +1,29 @@
import scrapy
-
from get_cartoon.items import MhgChapterItem

-domain = 'https://www.manhuagui.com'
+domain = "https://www.manhuagui.com"


class ManhuaguiSpider(scrapy.Spider):
    name = "manhuagui"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
-        'CONCURRENT_REQUESTS': 4,
-        'DOWNLOAD_DELAY': 3,
-        'COOKIES_ENABLED': False,
-        'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
+        "CONCURRENT_REQUESTS": 4,
+        "DOWNLOAD_DELAY": 3,
+        "COOKIES_ENABLED": False,
+        "PLAYWRIGHT_BROWSER_TYPE": "chromium",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
            "timeout": 15 * 1000,  # 15 seconds
-        }
+        },
    }

    def __init__(self, **kwargs):
-        self.allowed_domains = ['manhuagui.com']
-        self.start_urls = ['https://www.manhuagui.com/comic/22265/']
+        self.allowed_domains = ["manhuagui.com"]
+        self.start_urls = ["https://www.manhuagui.com/comic/22265/"]
        super().__init__(**kwargs)

    def start_requests(self):
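
The spider fills one `MhgChapterItem` per chapter. The item class itself is outside this diff; a minimal sketch of what `get_cartoon/items.py` would have to declare for the assignments in the next hunk to work (field names are taken from the keys the spider uses; everything else is assumed):

    import scrapy


    class MhgChapterItem(scrapy.Item):
        name = scrapy.Field()             # chapter title, from the link's @title
        url = scrapy.Field()              # relative chapter URL, from @href
        page_number = scrapy.Field()      # page count, e.g. "32p" with the "p" stripped
        web_image_items = scrapy.Field()  # dict of {page number: image path}
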
@@ -41,42 +40,51 @@ def parse(self, response):
        chapter_items = []
        for chapters_selector in chapters_selectors:
            chapter_item = MhgChapterItem()
-            chapter_item['name'] = chapters_selector.xpath('a[1]/@title').extract_first()
-            chapter_item['url'] = chapters_selector.xpath('a[1]/@href').extract_first()
-            chapter_item['page_number'] = chapters_selector.xpath('a[1]/span/i/text()').extract_first().removesuffix(
-                'p')
-            chapter_item['web_image_items'] = {}
+            chapter_item["name"] = chapters_selector.xpath(
+                "a[1]/@title"
+            ).extract_first()
+            chapter_item["url"] = chapters_selector.xpath("a[1]/@href").extract_first()
+            chapter_item["page_number"] = (
+                chapters_selector.xpath("a[1]/span/i/text()")
+                .extract_first()
+                .removesuffix("p")
+            )
+            chapter_item["web_image_items"] = {}
            chapter_items.append(chapter_item)

        for chapter_item in chapter_items:
-            yield scrapy.Request(url=f'{domain}/{chapter_item["url"]}',
-                                 meta={'item': chapter_item},
-                                 callback=self.parse_every_chapter_pages)
+            yield scrapy.Request(
+                url=f'{domain}/{chapter_item["url"]}',
+                meta={"item": chapter_item},
+                callback=self.parse_every_chapter_pages,
+            )

    def parse_every_chapter_pages(self, response):
-        chapter_item = response.meta['item']
-        pages = int(chapter_item['page_number'])
+        chapter_item = response.meta["item"]
+        pages = int(chapter_item["page_number"])
        for page in range(1, pages, 1):
            page_url = f'{domain}/{chapter_item["url"]}#p={str(page)}'
-            yield scrapy.Request(url=page_url,
-                                 meta=dict(
-                                     item=chapter_item,
-                                     current_page=page,
-                                     playwright=True,
-                                     playwright_include_page=True
-                                 ),
-                                 callback=self.parse_image_url,
-                                 dont_filter=True,
-                                 errback=self.errback_close_page)
+            yield scrapy.Request(
+                url=page_url,
+                meta=dict(
+                    item=chapter_item,
+                    current_page=page,
+                    playwright=True,
+                    playwright_include_page=True,
+                ),
+                callback=self.parse_image_url,
+                dont_filter=True,
+                errback=self.errback_close_page,
+            )

    async def parse_image_url(self, response):
        web_page = response.meta["playwright_page"]
        await web_page.close()
        current_page_number = response.meta["current_page"]
        image_path = response.xpath('//*[@id="mangaFile"]/@src').extract_first()
-        chapter_item = response.meta['item']
+        chapter_item = response.meta["item"]
        # {page number: image path}
-        chapter_item['web_image_items'].update({current_page_number: image_path})
+        chapter_item["web_image_items"].update({current_page_number: image_path})
        yield chapter_item

    async def errback_close_page(self, failure):
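
The diff ends at the `errback_close_page` signature, so its body is not shown. Since the page requests set `playwright_include_page=True`, the errback is the conventional place to close the Playwright page when a request fails; a minimal sketch following the usual scrapy-playwright pattern, not necessarily the author's actual body:

    async def errback_close_page(self, failure):
        # The failed request still carries its Playwright page; close it so
        # browser pages are not leaked on download errors.
        page = failure.request.meta["playwright_page"]
        await page.close()

One caveat in the unchanged context above: `range(1, pages, 1)` stops at `pages - 1`, so the last page of each chapter is never requested; `range(1, pages + 1)` would cover the whole chapter.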
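Assuming a standard Scrapy project named `get_cartoon` (implied by the import at the top), the spider would be run from the project root, with Playwright's Chromium installed once beforehand:

    playwright install chromium
    scrapy crawl manhuagui -o chapters.json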