-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
145 lines (97 loc) · 3.49 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
Various general helper functions.
"""
import os
def add_trailing_slash(path):
    """
    Function that adds a trailing slash to a string (likely a path).

    Parameters
        path: string to which the trailing slash will be added

    Returns
        path: the input with a '/' appended when it did not already end
              with one; an empty string is returned unchanged
    """
    # str.endswith is safe on an empty string, whereas path[-1] raises
    # IndexError. An empty path is deliberately left alone: turning '' into
    # '/' would silently redirect a relative location to the filesystem root.
    if path and not path.endswith('/'):
        path += '/'
    return path
def remove_trailing_slash(path):
    """
    Function that removes a trailing slash from a string (likely a path).

    Only a single trailing '/' is removed, so 'a//' becomes 'a/' and the
    string '/' becomes the empty string.

    Parameters
        path: string from which the trailing slash will be removed

    Returns
        path: the input without its last character when that character is
              '/'; otherwise the input unchanged (including an empty string)
    """
    # str.endswith is safe on an empty string, whereas path[-1] raises
    # IndexError.
    if path.endswith('/'):
        path = path[:-1]
    return path
def make_dir(path):
    """
    Function that creates a directory (and any missing parents) given a path.

    Nothing is done when something already exists at path. If the creation
    fails because the directory appeared in the meantime, a message is
    printed and the call returns normally.

    Parameters
        path: path of the directory to create

    Raises
        OSError: if the directory could not be created and does not exist
                 after the attempt (e.g. permission denied, or a parent
                 component is a regular file)
    """
    if os.path.exists(path):
        return
    try:
        os.makedirs(path)
    except OSError:
        # The only failure worth ignoring is "somebody else created it between
        # the exists() check and makedirs()". Any other OSError is a real
        # problem and must not be reported as "already existing".
        if not os.path.isdir(path):
            raise
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print(path + ' already existing, skipping creation...')
def points_to_substack(points_matrix, name):
    """
    Function that groups points by substack.

    Parameters
        points_matrix: numpy array of shape (n_samples, n_dimensions) representing the dataset
        name: numpy array of shape (n_samples, ) representing the IDs of the points;
              should be something like "MS_CENTER ??(??????)"

    Returns
        data_substacks: dictionary indexed with substack IDs; each value is the list
                        of rows of points_matrix whose name maps to that substack,
                        in their original order

    Raises
        RuntimeError: if the number of rows of points_matrix differs from len(name)
    """
    if points_matrix.shape[0] != len(name):
        raise RuntimeError("Number of points and their names should have the same length!")
    data_substacks = dict()
    # enumerate pairs every name with its row index without relying on the
    # Python-2-only xrange builtin.
    for index, point_name in enumerate(name):
        substack = extract_substack(point_name)
        # setdefault creates the list the first time a substack is seen.
        data_substacks.setdefault(substack, []).append(points_matrix[index, :])
    return data_substacks
def data_to_substack(data_matrix, substacks):
    """
    Function that extracts and groups data by given substacks.

    Parameters
        data_matrix: numpy array of shape (n_samples, n_data_dimensions) representing the data,
                     most likely a pandas.DataFrame.as_matrix(); last column must be
                     a substack column holding names like "MS_CENTER ??(??????)"
        substacks: set of substacks for which the function should extract and group points

    Returns
        data_substacks: dictionary indexed with substack IDs; each value is the list of
                        rows (without the last column) belonging to that substack.
                        Substacks not listed in substacks are skipped.
    """
    data_substacks = dict()
    # range works on both Python 2 and 3, unlike xrange.
    for index in range(data_matrix.shape[0]):
        substack = extract_substack(data_matrix[index, -1])
        if substack in substacks:
            # Keep every column except the trailing substack-name column.
            data_substacks.setdefault(substack, []).append(data_matrix[index, :-1])
    return data_substacks
def extract_substack(name):
    """
    Function that pulls the substack ID out of a point name such as
    "MS_CENTER ??(??????)".

    The name is split on single spaces; the second token is split on '(' and
    the last character of the part following the first '(' is dropped.

    Parameters
        name: a string which is an ID of a point, like "MS_CENTER ??(??????)"

    Returns
        substack: a string which is the substack ID extracted from name
    """
    tokens = name.split(' ')
    # Second token looks like "??(??????)"; keep what follows the first '('.
    after_paren = tokens[1].split('(')[1]
    # Drop the final character (the closing ')' in a well-formed name).
    return after_paren[:-1]
def get_filenames(path):
    """
    Function that lists the regular files located directly inside a directory.

    Parameters
        path: directory whose entries are inspected

    Returns
        filenames: list with the names (not the joined paths) of the entries
                   of path that are files, in the order given by os.listdir
    """
    filenames = []
    for entry in os.listdir(path):
        # isfile needs the joined path; listdir only yields bare names.
        if os.path.isfile(os.path.join(path, entry)):
            filenames.append(entry)
    return filenames