-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget-datasets.py
124 lines (92 loc) · 4.05 KB
/
get-datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import requests
import pandas as pd
import tqdm
import os
# API endpoint root and paging constants for the InfoJobs REST API.
_API_BASE = 'https://api.infojobs.net/api'
_PAGE_SIZE = 50
# How many times to re-request a page whose response carries no 'offers'
# key (rate limiting / transient errors) before skipping it.
_MAX_RETRIES = 3
# Seconds before an HTTP request is abandoned instead of hanging forever.
_TIMEOUT = 30


def _fetch_offers_page(headers, page):
    """Fetch one page of Spanish job offers and return the decoded JSON payload."""
    r = requests.get(
        f'{_API_BASE}/9/offer',
        headers=headers,
        # 'country' is sent on every request (including page 1) so all pages
        # come from the same filtered result set.
        params={'page': page, 'maxResults': _PAGE_SIZE, 'country': 'espana'},
        timeout=_TIMEOUT,
    )
    return r.json()


def _fetch_dictionary(headers, name, csv_path):
    """Download an InfoJobs dictionary (e.g. 'country', 'province') to a CSV file.

    The download is skipped when ``csv_path`` already exists (cache hit).

    Parameters
    ----------
    headers : dict
        HTTP headers carrying the API authorization.
    name : str
        Dictionary endpoint name, used in the URL and the log messages.
    csv_path : str
        Destination CSV file.
    """
    try:
        pd.read_csv(csv_path)
        print(f'\nThe {name} dataset is already cached')
    except FileNotFoundError:
        print(f'\nGetting the {name} dataset from InfoJobs API...')
        r = requests.get(f'{_API_BASE}/1/dictionary/{name}', headers=headers, timeout=_TIMEOUT)
        pd.DataFrame(r.json()).to_csv(csv_path, index=False)
        print(f'The dataset has been saved to {csv_path}\n')


def get_infojobs_datasets(api_key, path):
    """Download the InfoJobs jobs, country and province datasets as CSV files.

    Each dataset is cached: if its CSV already exists under ``path`` it is
    not downloaded again.

    Parameters
    ----------
    api_key : str
        Value for the ``Authorization`` header of the InfoJobs API.
    path : str
        Directory where ``jobs_dataset.csv``, ``country_dataset.csv`` and
        ``province_dataset.csv`` are written. Created if missing.

    Returns
    -------
    None. Saves the datasets to CSV files.
    """
    headers = {'Authorization': api_key}
    # exist_ok avoids the check-then-create race of the exists()/makedirs pair.
    os.makedirs(path, exist_ok=True)
    jobs_dataset_path = os.path.join(path, 'jobs_dataset.csv')
    province_dataset_path = os.path.join(path, 'province_dataset.csv')
    country_dataset_path = os.path.join(path, 'country_dataset.csv')

    try:
        pd.read_csv(jobs_dataset_path)
        print('\nThe jobs dataset is already cached')
    except FileNotFoundError:
        print('\nGetting the jobs dataset from InfoJobs API...')
        # The first page both seeds the data and reports the total size.
        first_payload = _fetch_offers_page(headers, 1)
        all_data = list(first_payload['offers'])
        total_results = first_payload['totalResults']

        progress_bar = tqdm.tqdm(total=total_results, unit='offers')
        progress_bar.update(len(all_data))

        # Pages are 1-based; page 1 is already consumed above, so start at 2.
        last_page = total_results // _PAGE_SIZE + 1
        page, retries = 2, 0
        while page <= last_page:
            payload = _fetch_offers_page(headers, page)
            offers = payload.get('offers')
            if offers is None:
                # Transient failure: retry the *same* page a bounded number
                # of times, then skip it. (A for-loop `page -= 1` cannot do
                # this because the loop variable is reassigned each pass.)
                retries += 1
                if retries >= _MAX_RETRIES:
                    page += 1
                    retries = 0
                continue
            all_data += offers
            progress_bar.update(len(offers))
            page += 1
            retries = 0
        progress_bar.close()

        pd.DataFrame(all_data).to_csv(jobs_dataset_path, index=False)
        print(f'The dataset has been saved to {jobs_dataset_path}\n')

    _fetch_dictionary(headers, 'country', country_dataset_path)
    _fetch_dictionary(headers, 'province', province_dataset_path)
if __name__ == '__main__':
    # Resolve everything relative to this file so the script works from any CWD.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    # Read the private API key. strip() removes the trailing newline most
    # editors append to the file, which would otherwise corrupt the
    # Authorization header sent to the API.
    with open(os.path.join(base_dir, 'api-key.txt'), 'r') as key_file:
        api_key = key_file.read().strip()
    # Download (or reuse cached) InfoJobs datasets under <project>/data/.
    get_infojobs_datasets(api_key, path=os.path.join(base_dir, 'data/'))