-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
fd10b03
commit b57b022
Showing
11 changed files
with
218 additions
and
126 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Weather/chromedriver |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,127 +1,4 @@ | ||
# Twitter-Crawler | ||
# Crawler | ||
|
||
Crawl Twitter replies and save their contents and like counts | ||
|
||
## Dataset | ||
|
||
### [Famous Words Twitter Dataset](https://www.kaggle.com/datasets/jackksoncsie/twitter-dataset-keywords-likes-and-tweets) | ||
|
||
|Name|type|Description| | ||
|:---:|:---:|:---:| | ||
|keyword|`str`|Keyword of the tweet| | ||
|likes|`int`|Number of likes| | ||
|tweet|`str`|Content of the tweet| | ||
|
||
## Usage | ||
|
||
Top 20 keywords in 2021, each keyword has `5000` tweets | ||
``` | ||
"COVID-19", | ||
"Vaccine", | ||
"Zoom", | ||
"Bitcoin", | ||
"Dogecoin", | ||
"NFT", | ||
"Elon Musk", | ||
"Tesla", | ||
"Amazon", | ||
"iPhone 12", | ||
"Remote work", | ||
"TikTok", | ||
"Instagram", | ||
"Facebook", | ||
"YouTube", | ||
"Netflix", | ||
"GameStop", | ||
"Super Bowl", | ||
"Olympics", | ||
"Black Lives Matter", | ||
"India vs England", | ||
"Ukraine", | ||
"Queen Elizabeth", | ||
"World Cup", | ||
"Jeffrey Dahmer", | ||
"Johnny Depp", | ||
"Will Smith", | ||
"Weather", | ||
"xvideo", | ||
"porn", | ||
"nba", | ||
"Macdonald", | ||
``` | ||
|
||
![](./image/tweet.png) | ||
|
||
### [Famous Words Twitter Reply Dataset](https://www.kaggle.com/datasets/jackksoncsie/famous-keyword-twitter-replies-dataset?rvi=1) | ||
|
||
|Name|type|Description| | ||
|:---:|:---:|:---:| | ||
|keyword|`str`|Keyword of the tweet| | ||
|main_tweet|`str`|Content of the tweet| | ||
|main_likes|`int`|Number of likes of the tweet| | ||
|reply|`str`|Content of the reply| | ||
|reply_likes|`int`|Number of likes of the reply| | ||
|
||
``` | ||
search_terms = [ | ||
"COVID-19", | ||
"Vaccine", | ||
"Zoom", | ||
"Bitcoin", | ||
"Dogecoin", | ||
"NFT", | ||
"Elon Musk", | ||
"Tesla", | ||
"Amazon", | ||
"iPhone 12", | ||
"Remote work", | ||
"TikTok", | ||
"Instagram", | ||
"Facebook", | ||
"YouTube", | ||
"Netflix", | ||
"GameStop", | ||
"Super Bowl", | ||
"Olympics", | ||
"Black Lives Matter", | ||
"Ukraine", | ||
"Queen Elizabeth", | ||
"World Cup", | ||
"weather", | ||
"nba", | ||
"Macdonald", | ||
"K-pop", | ||
"music", | ||
"movie", | ||
"sport", | ||
"news", | ||
"science", | ||
] | ||
``` | ||
|
||
![](./image/reply.png) | ||
|
||
## Other program | ||
|
||
- `count.py` : Count the number of tweets and replies | ||
- `check.py` : Check json file format | ||
- `search.py` : Count the number of likes of each reply | ||
|
||
## Method | ||
|
||
Using [Snscrape](https://github.com/JustAnotherArchivist/snscrape) | ||
|
||
Install Snscrape | ||
|
||
`pip3 install snscrape` | ||
|
||
Development version | ||
|
||
`pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git` | ||
|
||
## Reference | ||
|
||
- [Web Scraping with Python – How to Scrape Data from Twitter using Tweepy and Snscrape](https://www.freecodecamp.org/news/python-web-scraping-tutorial/) | ||
- [Tweepy](https://github.com/tweepy/tweepy) | ||
- [Snscrape](https://github.com/JustAnotherArchivist/snscrape) | ||
- [Twitter Developer](https://developer.twitter.com/en) | ||
- [Twitter Crawler](./twitter/README.md) | ||
- [Central Weather Bureau Crawler](./weather/README.md) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
# Twitter-Crawler | ||
|
||
Crawl Twitter replies and save their contents and like counts | ||
|
||
## Dataset | ||
|
||
### [Famous Words Twitter Dataset](https://www.kaggle.com/datasets/jackksoncsie/twitter-dataset-keywords-likes-and-tweets) | ||
|
||
|Name|type|Description| | ||
|:---:|:---:|:---:| | ||
|keyword|`str`|Keyword of the tweet| | ||
|likes|`int`|Number of likes| | ||
|tweet|`str`|Content of the tweet| | ||
|
||
## Usage | ||
|
||
Top 20 keywords in 2021, each keyword has `5000` tweets | ||
``` | ||
"COVID-19", | ||
"Vaccine", | ||
"Zoom", | ||
"Bitcoin", | ||
"Dogecoin", | ||
"NFT", | ||
"Elon Musk", | ||
"Tesla", | ||
"Amazon", | ||
"iPhone 12", | ||
"Remote work", | ||
"TikTok", | ||
"Instagram", | ||
"Facebook", | ||
"YouTube", | ||
"Netflix", | ||
"GameStop", | ||
"Super Bowl", | ||
"Olympics", | ||
"Black Lives Matter", | ||
"India vs England", | ||
"Ukraine", | ||
"Queen Elizabeth", | ||
"World Cup", | ||
"Jeffrey Dahmer", | ||
"Johnny Depp", | ||
"Will Smith", | ||
"Weather", | ||
"xvideo", | ||
"porn", | ||
"nba", | ||
"Macdonald", | ||
``` | ||
|
||
![](./image/tweet.png) | ||
|
||
### [Famous Words Twitter Reply Dataset](https://www.kaggle.com/datasets/jackksoncsie/famous-keyword-twitter-replies-dataset?rvi=1) | ||
|
||
|Name|type|Description| | ||
|:---:|:---:|:---:| | ||
|keyword|`str`|Keyword of the tweet| | ||
|main_tweet|`str`|Content of the tweet| | ||
|main_likes|`int`|Number of likes of the tweet| | ||
|reply|`str`|Content of the reply| | ||
|reply_likes|`int`|Number of likes of the reply| | ||
|
||
``` | ||
search_terms = [ | ||
"COVID-19", | ||
"Vaccine", | ||
"Zoom", | ||
"Bitcoin", | ||
"Dogecoin", | ||
"NFT", | ||
"Elon Musk", | ||
"Tesla", | ||
"Amazon", | ||
"iPhone 12", | ||
"Remote work", | ||
"TikTok", | ||
"Instagram", | ||
"Facebook", | ||
"YouTube", | ||
"Netflix", | ||
"GameStop", | ||
"Super Bowl", | ||
"Olympics", | ||
"Black Lives Matter", | ||
"Ukraine", | ||
"Queen Elizabeth", | ||
"World Cup", | ||
"weather", | ||
"nba", | ||
"Macdonald", | ||
"K-pop", | ||
"music", | ||
"movie", | ||
"sport", | ||
"news", | ||
"science", | ||
] | ||
``` | ||
|
||
![](./image/reply.png) | ||
|
||
## Other program | ||
|
||
- `count.py` : Count the number of tweets and replies | ||
- `check.py` : Check json file format | ||
- `search.py` : Count the number of likes of each reply | ||
|
||
## Method | ||
|
||
Using [Snscrape](https://github.com/JustAnotherArchivist/snscrape) | ||
|
||
Install Snscrape | ||
|
||
`pip3 install snscrape` | ||
|
||
Development version | ||
|
||
`pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git` | ||
|
||
## Reference | ||
|
||
- [Web Scraping with Python – How to Scrape Data from Twitter using Tweepy and Snscrape](https://www.freecodecamp.org/news/python-web-scraping-tutorial/) | ||
- [Tweepy](https://github.com/tweepy/tweepy) | ||
- [Snscrape](https://github.com/JustAnotherArchivist/snscrape) | ||
- [Twitter Developer](https://developer.twitter.com/en) |
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Central Weather Bureau Crawler | ||
|
||
Main website: [中央氣象局](https://www.cwb.gov.tw/V8/C/W/week.html) | ||
The website which I crawled: [Monthly Data](https://www.cwb.gov.tw/V8/E/C/Statistics/monthlydata.html) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# I want to crawl a website https://www.cwb.gov.tw/V8/E/C/Statistics/monthlydata.html | ||
# It has one select year and one select month | ||
# The main data comes from div class="col-md-12" | ||
# crawl the data and save it to a csv file | ||
# column will be: Date, Station, Average Temperature, Precipitation, Humidity, Wind Speed, Sunshine Duration, Mean Pressure | ||
# Date format: YYYY-MM, from 2010-01 to 2020-12 | ||
|
||
import requests | ||
from selenium import webdriver | ||
from bs4 import BeautifulSoup | ||
import pandas as pd | ||
import time | ||
import random | ||
import csv | ||
|
||
# set the year and month | ||
def set_year_month(year, month):
    """Pick a year and a month on the current page, then submit the search.

    Operates on the module-level ``driver`` (a Selenium WebDriver), which
    must already have the page containing the ``year`` and ``month``
    drop-downs loaded.

    Args:
        year: Visible text of the year option to select, e.g. ``"2010"``.
        month: Visible text of the month option to select, e.g. ``"01"``.

    Note:
        If no option's text matches, that drop-down is left untouched and
        the search button is still clicked.
    """
    # The ``find_element_by_*`` helpers were removed in Selenium 4.3; the
    # ``By``-based locators work on both old and new Selenium releases.
    # Imported here so the function does not depend on the file's import block.
    from selenium.webdriver.common.by import By

    # Year first, then month: same order in which the page is operated by hand.
    for select_id, wanted_text in (("year", year), ("month", month)):
        select_element = driver.find_element(By.ID, select_id)
        for option in select_element.find_elements(By.TAG_NAME, "option"):
            if option.text == wanted_text:
                option.click()
                break

    # Submit the query.
    driver.find_element(By.ID, "button").click()

    # Fixed, randomised pause (1-3 s) to give the page time to refresh.
    time.sleep(random.randint(1, 3))
|
||
# get first row data | ||
def get_first_row_data():
    """Return the text of the first eight cells of the first table row.

    Reads from the module-level ``driver`` (a Selenium WebDriver). The row is
    located with the XPath ``//div[@class='col-md-12']/table/tbody/tr[1]``.

    Returns:
        An 8-tuple of strings, in cell order. The caller treats them as
        ``(date, station, average_temperature, precipitation, humidity,
        wind_speed, sunshine_duration, mean_pressure)``.
        NOTE(review): that mapping assumes the page's column order; confirm
        against the live table.

    Raises:
        IndexError: If the row has fewer than eight ``<td>`` cells.
    """
    # The ``find_element_by_*`` helpers were removed in Selenium 4.3; the
    # ``By``-based locators work on both old and new Selenium releases.
    # Imported here so the function does not depend on the file's import block.
    from selenium.webdriver.common.by import By

    first_row = driver.find_element(
        By.XPATH, "//div[@class='col-md-12']/table/tbody/tr[1]"
    )
    cells = first_row.find_elements(By.TAG_NAME, "td")

    # Index explicitly (rather than slice) so a short row fails loudly
    # instead of returning a tuple the caller cannot unpack.
    return tuple(cells[index].text for index in range(8))
|
||
if __name__ == "__main__":
    # Scrape the first table row for one year/month and write it to weather.csv.
    url = "https://www.cwb.gov.tw/V8/E/C/Statistics/monthlydata.html"

    # ``driver`` is deliberately module-level: set_year_month() and
    # get_first_row_data() read it as a global.
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Select the period to query.
        year = "2010"
        month = "01"
        set_year_month(year, month)

        # One row of results, in the same order as the CSV header below.
        data = list(get_first_row_data())

        # ``with`` guarantees the file is flushed and closed even if a write
        # fails; newline="" is what the csv module expects for its own
        # line-ending handling.
        with open("weather.csv", "w", newline="", encoding="utf-8") as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(["Date", "Station", "Average Temperature", "Precipitation", "Humidity", "Wind Speed", "Sunshine Duration", "Mean Pressure"])
            writer.writerow(data)
    finally:
        # quit() ends the whole WebDriver session (browser and driver
        # process), and runs even when scraping raised.
        driver.quit()