Commit 180c119

feat: learn pre-commit
Parent: 7b746d3

Note: this is a large commit, and some of its changed files are hidden from the view below.

68 files changed (+607, -467 lines)
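
The diffs below are the characteristic output of auto-formatting pre-commit hooks run across the whole repository: imports are re-sorted, single quotes become double quotes, long calls are re-wrapped with trailing commas, trailing whitespace disappears, and final newlines are normalized. The commit's actual hook configuration is not visible in this view; a minimal .pre-commit-config.yaml that would produce changes of this kind (the repositories and hook ids below are real, but the revisions and the isort profile are illustrative placeholders, not taken from this repository) could look like this:

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0  # placeholder revision
    hooks:
      - id: trailing-whitespace   # strips trailing spaces (README and requirements.txt changes)
      - id: end-of-file-fixer     # normalizes final newlines and trailing blank lines
  - repo: https://github.com/PyCQA/isort
    rev: 5.12.0  # placeholder revision
    hooks:
      - id: isort                 # re-orders imports (middlewares.py, ktmm.py, manhuagui.py)
        args: ["--profile", "black"]
  - repo: https://github.com/psf/black
    rev: 23.3.0  # placeholder revision
    hooks:
      - id: black                 # quote style, call wrapping, trailing commas

With such a config in place, running pre-commit install registers the hooks in git, and pre-commit run --all-files applies them to every tracked file, which would explain a repository-wide sweep like this one.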

README.md (-1)

@@ -6,4 +6,3 @@ Python self-study project
 
 - Notes on how to use Python libraries
 - Interesting things learned, recorded as code
-
(removes a trailing blank line)

acrobatics/ktmm/README.md (+1, -1)

@@ -18,4 +18,4 @@ ktmm is a script that prevents the system from sleeping by moving the mouse.
 
 ### Uninstall
 
-pyenv virtualenv-delete ktmm
+pyenv virtualenv-delete ktmm
(whitespace-only change: trailing whitespace or final newline fixed)

acrobatics/ktmm/ktmm.py (+9, -8)

@@ -1,10 +1,11 @@
-from pynput.mouse import Controller
 import time
 
-if __name__ == '__main__':
-    mouse = Controller()
-    while True:
-        mouse.move(0.5, 0.5)
-        time.sleep(10)
-        mouse.move(-0.5, -0.5)
-        time.sleep(10)
+from pynput.mouse import Controller
+
+if __name__ == "__main__":
+    mouse = Controller()
+    while True:
+        mouse.move(0.5, 0.5)
+        time.sleep(10)
+        mouse.move(-0.5, -0.5)
+        time.sleep(10)
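
Reassembled from the + lines above, the post-commit ktmm.py reads as a complete script; the inline comments here are editorial notes, not part of the committed file:

import time

from pynput.mouse import Controller

if __name__ == "__main__":
    mouse = Controller()  # pynput controller for the system mouse
    while True:
        # Nudge the cursor half a pixel out and back every ten seconds;
        # the net movement is zero, but the system never registers as idle.
        mouse.move(0.5, 0.5)
        time.sleep(10)
        mouse.move(-0.5, -0.5)
        time.sleep(10)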

acrobatics/ktmm/requirements.txt (+1, -1)

@@ -1 +1 @@
-pynput
+pynput
(whitespace-only change)

crawler/my_playwright/README.md (+1, -3)

@@ -1,6 +1,6 @@
 # Playwright
 
-Playwright-related demos
+Playwright-related demos
 
 
 ## Environment setup
@@ -35,5 +35,3 @@ hello_world_demo is a simple introductory scrapy project
 ## Uninstall the project
 
 pyenv virtualenv-delete my_playwright
-
-
(the -/+ pair in the first hunk removes only trailing whitespace; the second hunk drops two trailing blank lines)
(file name not shown in this view) (+1, -1)

@@ -1 +1 @@
-playwright
+playwright
(whitespace-only change)

crawler/my_scrapy/README.md (+4, -6)

@@ -1,9 +1,9 @@
 # Scrapy
 
-Scrapy-related demos
+Scrapy-related demos
 
 ## Disclaimer
-1. If users abuse this project, the author accepts no legal responsibility.
+1. If users abuse this project, the author accepts no legal responsibility.
 2. This program is for entertainment only and its source is fully open; abuse, resale for profit, and commercial use are prohibited.
 
 
@@ -15,7 +15,7 @@ Scrapy-related demos
 pyenv virtualenv 3.10.9 my-scrapy  // also use the IDE's Add Interpreter feature so this virtualenv is recognized
 pyenv activate my-scrapy
 python -m pip install --upgrade pip
-cd crawler/my_scrapy
+cd crawler/my_scrapy
 pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
 
 
@@ -46,7 +46,7 @@ hello_world_demo is a simple introductory scrapy project
 get_cartoon is a scrapy comic crawler
 
 #### Related documentation
-
+
 Scrapy: https://docs.scrapy.org/en/latest/intro/overview.html
 
 #### Run
@@ -57,5 +57,3 @@ get_cartoon is a scrapy comic crawler
 ## Uninstall the project
 
 pyenv virtualenv-delete my-scrapy
-
-
(each -/+ pair above is a trailing-whitespace-only change; the final hunk also drops two trailing blank lines)

crawler/my_scrapy/get_cartoon/get_cartoon/middlewares.py (+2, -3)

@@ -3,10 +3,9 @@
 # See documentation in:
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
-from scrapy import signals
-
 # useful for handling different item types with a single interface
-from itemadapter import is_item, ItemAdapter
+from itemadapter import ItemAdapter, is_item
+from scrapy import signals
 
 
 class GetCartoonSpiderMiddleware:

crawler/my_scrapy/get_cartoon/get_cartoon/pipelines.py (+7, -7)

@@ -8,15 +8,13 @@
 import os
 
 import requests
-
 from get_cartoon import settings
 
 
 class MhgChapterPipeline:
-
     def process_item(self, item, spider):
         # If image links were obtained, do the following
-        web_image_items = item['web_image_items']
+        web_image_items = item["web_image_items"]
         if web_image_items:
             # Prepare the folder
             local_file_path = f'{settings.IMAGES_STORE}/{item["name"]}'
@@ -25,16 +23,18 @@ def process_item(self, item, spider):
 
             # Fetch each image link
             for key, value in web_image_items.items():
-                image_file_name = f'{str(key)}.jpeg'
+                image_file_name = f"{str(key)}.jpeg"
                 # Image save path
-                full_file_path = f'{local_file_path}/{image_file_name}'
+                full_file_path = f"{local_file_path}/{image_file_name}"
                 # Save the image
                 self.save_to_local(full_file_path, value)
         return item
 
     def save_to_local(self, full_file_path, web_image):
-        with open(full_file_path, 'wb') as handle:
-            response = requests.get(url=web_image, headers={'Referer': 'https://www.manhuagui.com/'})
+        with open(full_file_path, "wb") as handle:
+            response = requests.get(
+                url=web_image, headers={"Referer": "https://www.manhuagui.com/"}
+            )
             for block in response.iter_content(1024):
                 if not block:
                     break
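
One observation on the pipeline above: requests.get is called without stream=True, so the whole image body is downloaded before iter_content chunks it from memory. A streaming variant (a sketch for illustration only, not part of this commit) would look roughly like:

    def save_to_local(self, full_file_path, web_image):
        # Sketch only: stream the download so large images are written to disk
        # in 1 KiB chunks instead of being held fully in memory first.
        headers = {"Referer": "https://www.manhuagui.com/"}
        with requests.get(url=web_image, headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(full_file_path, "wb") as handle:
                for block in response.iter_content(1024):
                    if block:
                        handle.write(block)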

crawler/my_scrapy/get_cartoon/get_cartoon/settings.py (+3, -3)

@@ -14,7 +14,7 @@
 NEWSPIDER_MODULE = "get_cartoon.spiders"
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
+USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -65,7 +65,7 @@
 ITEM_PIPELINES = {
     "get_cartoon.pipelines.MhgChapterPipeline": 1,
 }
-IMAGES_STORE = str(pathlib.Path.home()) + '/Downloads/get_cartoon'
+IMAGES_STORE = str(pathlib.Path.home()) + "/Downloads/get_cartoon"
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
@@ -91,4 +91,4 @@
 # Set settings whose default value is deprecated to a future-proof value
 REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-FEED_EXPORT_ENCODING = "utf-8"
+FEED_EXPORT_ENCODING = "utf-8"
(the last hunk is a whitespace-only change, likely normalizing the final newline)

crawler/my_scrapy/get_cartoon/get_cartoon/spiders/manhuagui.py (+39, -31)

@@ -1,30 +1,29 @@
 import scrapy
-
 from get_cartoon.items import MhgChapterItem
 
-domain = 'https://www.manhuagui.com'
+domain = "https://www.manhuagui.com"
 
 
 class ManhuaguiSpider(scrapy.Spider):
     name = "manhuagui"
     custom_settings = {
         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
-        'CONCURRENT_REQUESTS': 4,
-        'DOWNLOAD_DELAY': 3,
-        'COOKIES_ENABLED': False,
-        'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
+        "CONCURRENT_REQUESTS": 4,
+        "DOWNLOAD_DELAY": 3,
+        "COOKIES_ENABLED": False,
+        "PLAYWRIGHT_BROWSER_TYPE": "chromium",
         "DOWNLOAD_HANDLERS": {
             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
         },
         "PLAYWRIGHT_LAUNCH_OPTIONS": {
             "headless": True,
             "timeout": 15 * 1000,  # 15 seconds
-        }
+        },
     }
 
     def __init__(self, **kwargs):
-        self.allowed_domains = ['manhuagui.com']
-        self.start_urls = ['https://www.manhuagui.com/comic/22265/']
+        self.allowed_domains = ["manhuagui.com"]
+        self.start_urls = ["https://www.manhuagui.com/comic/22265/"]
         super().__init__(**kwargs)
 
     def start_requests(self):
@@ -41,42 +40,51 @@ def parse(self, response):
         chapter_items = []
         for chapters_selector in chapters_selectors:
             chapter_item = MhgChapterItem()
-            chapter_item['name'] = chapters_selector.xpath('a[1]/@title').extract_first()
-            chapter_item['url'] = chapters_selector.xpath('a[1]/@href').extract_first()
-            chapter_item['page_number'] = chapters_selector.xpath('a[1]/span/i/text()').extract_first().removesuffix(
-                'p')
-            chapter_item['web_image_items'] = {}
+            chapter_item["name"] = chapters_selector.xpath(
+                "a[1]/@title"
+            ).extract_first()
+            chapter_item["url"] = chapters_selector.xpath("a[1]/@href").extract_first()
+            chapter_item["page_number"] = (
+                chapters_selector.xpath("a[1]/span/i/text()")
+                .extract_first()
+                .removesuffix("p")
+            )
+            chapter_item["web_image_items"] = {}
             chapter_items.append(chapter_item)
 
         for chapter_item in chapter_items:
-            yield scrapy.Request(url=f'{domain}/{chapter_item["url"]}',
-                                 meta={'item': chapter_item},
-                                 callback=self.parse_every_chapter_pages)
+            yield scrapy.Request(
+                url=f'{domain}/{chapter_item["url"]}',
+                meta={"item": chapter_item},
+                callback=self.parse_every_chapter_pages,
+            )
 
     def parse_every_chapter_pages(self, response):
-        chapter_item = response.meta['item']
-        pages = int(chapter_item['page_number'])
+        chapter_item = response.meta["item"]
+        pages = int(chapter_item["page_number"])
         for page in range(1, pages, 1):
             page_url = f'{domain}/{chapter_item["url"]}#p={str(page)}'
-            yield scrapy.Request(url=page_url,
-                                 meta=dict(
-                                     item=chapter_item,
-                                     current_page=page,
-                                     playwright=True,
-                                     playwright_include_page=True
-                                 ),
-                                 callback=self.parse_image_url,
-                                 dont_filter=True,
-                                 errback=self.errback_close_page)
+            yield scrapy.Request(
+                url=page_url,
+                meta=dict(
+                    item=chapter_item,
+                    current_page=page,
+                    playwright=True,
+                    playwright_include_page=True,
+                ),
+                callback=self.parse_image_url,
+                dont_filter=True,
+                errback=self.errback_close_page,
+            )
 
     async def parse_image_url(self, response):
        web_page = response.meta["playwright_page"]
        await web_page.close()
        current_page_number = response.meta["current_page"]
        image_path = response.xpath('//*[@id="mangaFile"]/@src').extract_first()
-        chapter_item = response.meta['item']
+        chapter_item = response.meta["item"]
         # {comic page number: image path}
-        chapter_item['web_image_items'].update({current_page_number: image_path})
+        chapter_item["web_image_items"].update({current_page_number: image_path})
         yield chapter_item
 
     async def errback_close_page(self, failure):
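
errback_close_page is referenced in the requests above, but its body lies outside this hunk. In scrapy-playwright spiders the conventional implementation closes the Playwright page attached to the failed request; the following is an assumed sketch of that pattern, not the file's actual code:

    async def errback_close_page(self, failure):
        # Assumed sketch: release the Playwright page so the browser does not
        # accumulate open pages when a request fails.
        page = failure.request.meta["playwright_page"]
        await page.close()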

crawler/my_scrapy/hello_world_demo/hello_world_demo/middlewares.py (+2, -3)

@@ -3,10 +3,9 @@
 # See documentation in:
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
-from scrapy import signals
-
 # useful for handling different item types with a single interface
-from itemadapter import is_item, ItemAdapter
+from itemadapter import ItemAdapter, is_item
+from scrapy import signals
 
 
 class HelloWorldDemoSpiderMiddleware:

crawler/my_scrapy/hello_world_demo/hello_world_demo/settings.py (+26, -26)

@@ -16,7 +16,7 @@
 # USER_AGENT = "hello_world_demo (+http://www.yourdomain.com)"
 
 # A user agent must be configured, otherwise crawling fails
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
+USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
@@ -27,67 +27,67 @@
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
 #    "Accept-Language": "en",
-#}
+# }
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    "hello_world_demo.middlewares.HelloWorldDemoSpiderMiddleware": 543,
-#}
+# }
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# DOWNLOADER_MIDDLEWARES = {
 #    "hello_world_demo.middlewares.HelloWorldDemoDownloaderMiddleware": 543,
-#}
+# }
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 #    "scrapy.extensions.telnet.TelnetConsole": None,
-#}
+# }
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# ITEM_PIPELINES = {
 #    "hello_world_demo.pipelines.HelloWorldDemoPipeline": 300,
-#}
+# }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = "httpcache"
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = "httpcache"
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
 
 # Set settings whose default value is deprecated to a future-proof value
 REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
