[UPDATE] sth.

jacksonchen1998 · Jun 17, 2023 · b57b022 · b57b022
1 parent fd10b03
commit b57b022
Show file tree

Hide file tree

Showing 11 changed files with 218 additions and 126 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+Weather/chromedriver
diff --git a/README.md b/README.md
@@ -1,127 +1,4 @@
-# Twitter-Crawler
+# Crawler
 
-Crawling Twitter's reply and save their contents and likes
-
-## Dataset
-
-### [Famous Words Twitter Dataset](https://www.kaggle.com/datasets/jackksoncsie/twitter-dataset-keywords-likes-and-tweets)
-
-|Name|type|Description|
-|:---:|:---:|:---:|
-|keyword|`str`|Keyword of the tweet|
-|likes|`int`|Number of likes|
-|tweet|`str`|Content of the tweet|
-
-## Usage
-
-Top 20 keywords in 2021, each keyword has `5000` tweets
-```
-"COVID-19",
-"Vaccine",
-"Zoom",
-"Bitcoin",
-"Dogecoin",
-"NFT",
-"Elon Musk",
-"Tesla",
-"Amazon",
-"iPhone 12",
-"Remote work",
-"TikTok",
-"Instagram",
-"Facebook",
-"YouTube",
-"Netflix",
-"GameStop",
-"Super Bowl",
-"Olympics",
-"Black Lives Matter"
-"India vs England",
-"Ukraine",
-"Queen Elizabeth",
-"World Cup",
-"Jeffrey Dahmer",
-"Johnny Depp",
-"Will Smith",
-"Weather",
-"xvideo",
-"porn",
-"nba",
-"Macdonald",
-```
-
-![](./image/tweet.png)
-
-### [Famous Words Twitter Reply Dataset](https://www.kaggle.com/datasets/jackksoncsie/famous-keyword-twitter-replies-dataset?rvi=1)
-
-|Name|type|Description|
-|:---:|:---:|:---:|
-|keyword|`str`|Keyword of the tweet|
-|main_tweet|`str`|Content of the tweet|
-|main_likes|`int`|Number of likes of the tweet|
-|reply|`str`|Content of the reply|
-|reply_likes|`int`|Number of likes of the reply|
-
-```
-search_terms = [
-    "COVID-19",
-    "Vaccine",
-    "Zoom",
-    "Bitcoin",
-    "Dogecoin",
-    "NFT",
-    "Elon Musk",
-    "Tesla",
-    "Amazon",
-    "iPhone 12",
-    "Remote work",
-    "TikTok",
-    "Instagram",
-    "Facebook",
-    "YouTube",
-    "Netflix",
-    "GameStop",
-    "Super Bowl",
-    "Olympics",
-    "Black Lives Matter"
-    "Ukraine",
-    "Queen Elizabeth",
-    "World Cup",
-    "weather",
-    "nba",
-    "Macdonald",
-    "K-pop",
-    "music",
-    "movie",
-    "sport",
-    "news",
-    "science",
-]
-```
-
-![](./image/reply.png)
-
-## Other program
-
-- `count.py` : Count the number of tweets and replies
-- `check.py` : Check json file format
-- `search.py` : Count the number of likes of each reply
-
-## Method
-
-Using [Snscrape](https://github.com/JustAnotherArchivist/snscrape)
-
-Install Snscrape
-
-`pip3 install snscrape`
-
-Development version
-
-`pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git`
-
-## Reference
-
-- [Web Scraping with Python – How to Scrape Data from Twitter using Tweepy and Snscrape](https://www.freecodecamp.org/news/python-web-scraping-tutorial/)
-- [Tweepy](https://github.com/tweepy/tweepy)
-- [Snscrape](https://github.com/JustAnotherArchivist/snscrape)
-- [Twitter Developer](https://developer.twitter.com/en)
+- [Twitter Crawler](./Twitter/README.md)
+- [Central Weather Bureau Crawler](./Weather/README.md)
diff --git a/Twitter/README.md b/Twitter/README.md
@@ -0,0 +1,127 @@
+# Twitter-Crawler
+
+Crawl Twitter replies and save their contents and like counts
+
+## Dataset
+
+### [Famous Words Twitter Dataset](https://www.kaggle.com/datasets/jackksoncsie/twitter-dataset-keywords-likes-and-tweets)
+
+|Name|type|Description|
+|:---:|:---:|:---:|
+|keyword|`str`|Keyword of the tweet|
+|likes|`int`|Number of likes|
+|tweet|`str`|Content of the tweet|
+
+## Usage
+
+Top 20 keywords in 2021, each keyword has `5000` tweets
+```
+"COVID-19",
+"Vaccine",
+"Zoom",
+"Bitcoin",
+"Dogecoin",
+"NFT",
+"Elon Musk",
+"Tesla",
+"Amazon",
+"iPhone 12",
+"Remote work",
+"TikTok",
+"Instagram",
+"Facebook",
+"YouTube",
+"Netflix",
+"GameStop",
+"Super Bowl",
+"Olympics",
+"Black Lives Matter",
+"India vs England",
+"Ukraine",
+"Queen Elizabeth",
+"World Cup",
+"Jeffrey Dahmer",
+"Johnny Depp",
+"Will Smith",
+"Weather",
+"xvideo",
+"porn",
+"nba",
+"Macdonald",
+```
+
+![](./image/tweet.png)
+
+### [Famous Words Twitter Reply Dataset](https://www.kaggle.com/datasets/jackksoncsie/famous-keyword-twitter-replies-dataset?rvi=1)
+
+|Name|type|Description|
+|:---:|:---:|:---:|
+|keyword|`str`|Keyword of the tweet|
+|main_tweet|`str`|Content of the tweet|
+|main_likes|`int`|Number of likes of the tweet|
+|reply|`str`|Content of the reply|
+|reply_likes|`int`|Number of likes of the reply|
+
+```
+search_terms = [
+    "COVID-19",
+    "Vaccine",
+    "Zoom",
+    "Bitcoin",
+    "Dogecoin",
+    "NFT",
+    "Elon Musk",
+    "Tesla",
+    "Amazon",
+    "iPhone 12",
+    "Remote work",
+    "TikTok",
+    "Instagram",
+    "Facebook",
+    "YouTube",
+    "Netflix",
+    "GameStop",
+    "Super Bowl",
+    "Olympics",
+    "Black Lives Matter",
+    "Ukraine",
+    "Queen Elizabeth",
+    "World Cup",
+    "weather",
+    "nba",
+    "Macdonald",
+    "K-pop",
+    "music",
+    "movie",
+    "sport",
+    "news",
+    "science",
+]
+```
+
+![](./image/reply.png)
+
+## Other programs
+
+- `count.py` : Count the number of tweets and replies
+- `check.py` : Check json file format
+- `search.py` : Count the number of likes of each reply
+
+## Method
+
+Using [Snscrape](https://github.com/JustAnotherArchivist/snscrape)
+
+Install Snscrape
+
+`pip3 install snscrape`
+
+Development version
+
+`pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git`
+
+## Reference
+
+- [Web Scraping with Python – How to Scrape Data from Twitter using Tweepy and Snscrape](https://www.freecodecamp.org/news/python-web-scraping-tutorial/)
+- [Tweepy](https://github.com/tweepy/tweepy)
+- [Snscrape](https://github.com/JustAnotherArchivist/snscrape)
+- [Twitter Developer](https://developer.twitter.com/en)
diff --git a/check.py → Twitter/check.py b/check.py → Twitter/check.py
diff --git a/count.py → Twitter/count.py b/count.py → Twitter/count.py
diff --git a/crawl/main.py → Twitter/crawl/main.py b/crawl/main.py → Twitter/crawl/main.py
diff --git a/crawl/reply.py → Twitter/crawl/reply.py b/crawl/reply.py → Twitter/crawl/reply.py
diff --git a/search.py → Twitter/search.py b/search.py → Twitter/search.py
diff --git a/Weather/README.md b/Weather/README.md
@@ -0,0 +1,4 @@
+# Central Weather Bureau Crawler
+
+- Main website: [中央氣象局](https://www.cwb.gov.tw/V8/C/W/week.html)
+- Crawled page: [Monthly Data](https://www.cwb.gov.tw/V8/E/C/Statistics/monthlydata.html)
diff --git a/Weather/main.py b/Weather/main.py
@@ -0,0 +1,83 @@
+# I want to crawl a website https://www.cwb.gov.tw/V8/E/C/Statistics/monthlydata.html
+# It has one select year and one select month
+# The main data comes from div class="col-md-12"
+# crawl the data and save it to a csv file
+# column will be: Date, Station, Average Temperature, Precipitation, Humidity, Wind Speed, Sunshine Duration, Mean Pressure
+# Date format: YYYY-MM, from 2010-01 to 2020-12
+
+import requests
+from selenium import webdriver
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+import random
+import csv
+
+def _select_option(select_id, text):
+    """Click the <option> with visible text ``text`` inside the element with id ``select_id``.
+
+    Uses the module-level ``driver``. If no option's text equals ``text``
+    exactly, nothing is clicked and no error is raised.
+    """
+    # Function-scope import keeps this block self-contained; the file's
+    # top-level imports only bring in ``webdriver``.
+    from selenium.webdriver.common.by import By
+
+    select_element = driver.find_element(By.ID, select_id)
+    for option in select_element.find_elements(By.TAG_NAME, "option"):
+        if option.text == text:
+            option.click()
+            break
+
+
+def set_year_month(year, month):
+    """Choose ``year`` and ``month`` in the page's selectors and submit the search.
+
+    Operates on the module-level ``driver``, which must already have the
+    page loaded.
+
+    Args:
+        year: Text of the option to pick in the element with id "year".
+        month: Text of the option to pick in the element with id "month".
+
+    Both values are compared to each option's visible text with ``==``, so
+    they must match it exactly; an unmatched value leaves that selector
+    unchanged.
+    """
+    # ``find_element_by_*`` helpers were removed in Selenium 4.3; the
+    # ``find_element(By.<strategy>, value)`` form works on old and new versions.
+    from selenium.webdriver.common.by import By
+
+    _select_option("year", year)
+    _select_option("month", month)
+    # Click the search button to request the selected period.
+    driver.find_element(By.ID, "button").click()
+    # Fixed random pause (1-3 s) to let the page load before it is read.
+    time.sleep(random.randint(1, 3))
+
+def get_first_row_data():
+    """Return the text of the first eight cells in the first row of the data table.
+
+    Reads from the module-level ``driver``; the row is located with the
+    XPath ``//div[@class='col-md-12']/table/tbody/tr[1]``.
+
+    Returns:
+        An 8-tuple of strings, in cell order. Callers in this file unpack it
+        as (date, station, average_temperature, precipitation, humidity,
+        wind_speed, sunshine_duration, mean_pressure).
+        NOTE(review): that column order is assumed by the caller, not checked
+        here -- confirm against the live page.
+
+    Raises:
+        IndexError: If the row has fewer than eight ``<td>`` cells.
+    """
+    # ``find_element_by_*`` helpers were removed in Selenium 4.3; the
+    # ``find_element(By.<strategy>, value)`` form works on old and new versions.
+    from selenium.webdriver.common.by import By
+
+    first_row = driver.find_element(By.XPATH, "//div[@class='col-md-12']/table/tbody/tr[1]")
+    cells = first_row.find_elements(By.TAG_NAME, "td")
+    # Index explicitly (rather than slicing) so a short row still fails loudly.
+    return tuple(cells[index].text for index in range(8))
+
+if __name__ == "__main__":
+    # Page that exposes the year/month selectors and the data table.
+    url = "https://www.cwb.gov.tw/V8/E/C/Statistics/monthlydata.html"
+    # Module-level WebDriver; set_year_month() and get_first_row_data() read it as a global.
+    driver = webdriver.Chrome()
+    try:
+        driver.get(url)
+        # Period to query; the values are matched against the option text.
+        year = "2010"
+        month = "01"
+        set_year_month(year, month)
+        # One row: the first eight cells of the first table row.
+        data = list(get_first_row_data())
+        # newline="" is what the csv module expects for files it writes; the
+        # explicit encoding keeps the output independent of the platform default.
+        with open("weather.csv", "w", newline="", encoding="utf-8") as csv_file:
+            writer = csv.writer(csv_file)
+            writer.writerow(["Date", "Station", "Average Temperature", "Precipitation", "Humidity", "Wind Speed", "Sunshine Duration", "Mean Pressure"])
+            writer.writerow(data)
+    finally:
+        # quit() ends the whole WebDriver session (close() only closes the
+        # current window), and running it in ``finally`` avoids leaking the
+        # browser when scraping fails.
+        driver.quit()