-
Notifications
You must be signed in to change notification settings - Fork 0
/
GQ_utils.py
286 lines (220 loc) · 9.07 KB
/
GQ_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# -*- coding: utf-8 -*-
"""
Created on August 2024
@author: Magued Al-Aghbary & Mohamed Sobh
"""
# Helper functions
import pandas as pd
from IPython.display import display
import numpy as np
from scipy.spatial import cKDTree as KDTree
import pyproj as proj
from pyproj import Transformer
from scipy import stats, interpolate
milli= 0.001
def table_grid_search(clf, all_columns=False, all_ranks=False, save=True):
# Convert the cross validated results in a DataFrame ordered by `rank_test_score` and `mean_fit_time`.
# As it is frequent to have more than one combination with the same max score,
# the one with the least mean fit time SHALL appear first.
cv_results = pd.DataFrame(clf.cv_results_).sort_values(by=['rank_test_score', 'mean_fit_time'])
# Reorder
columns = cv_results.columns.tolist()
# rank_test_score first, mean_test_score second and std_test_score third
columns = columns[-1:] + columns[-3:-1] + columns[:-3]
cv_results = cv_results[columns]
if save:
cv_results.to_csv('--'.join(cv_results['params'][0].keys()) + '.csv', index=True, index_label='Id')
# Unless all_columns are True, drop not wanted columns: params, std_* split*
if not all_columns:
cv_results.drop('params', axis='columns', inplace=True)
cv_results.drop(list(cv_results.filter(regex='^std_.*')), axis='columns', inplace=True)
cv_results.drop(list(cv_results.filter(regex='^split.*')), axis='columns', inplace=True)
# Unless all_ranks are True, filter out those rows which have rank equal to one
if not all_ranks:
cv_results = cv_results[cv_results['rank_test_score'] == 1]
cv_results.drop('rank_test_score', axis = 'columns', inplace = True)
cv_results = cv_results.style.hide_index()
display(cv_results)
# Haversine arc distance
def distance(lat1, lon1, lat2, lon2):
'''
Haversine formula returns distance between pairs of coordinates.
coordinates as numpy arrays, lists or real
The haversine formula determines the great-circle distance between
two points on a sphere given their longitudes and latitudes
'''
p = 0.017453292519943295 # pi/180
a = 0.5 - np.cos((lat2-lat1)*p)/2 + np.cos(lat1*p)*np.cos(lat2*p) * (1-np.cos((lon2-lon1)*p)) / 2
return 12742.0176 * np.arcsin(np.sqrt(a)) # returns in km
#function to make inverse distance weight interpolation
N = 10000
Ndim = 2
Nask = N # N Nask 1e5: 24 sec 2d, 27 sec 3d on mac g4 ppc
Nnear = 8 # 8 2d, 11 3d => 5 % chance one-sided -- Wendel, mathoverflow.com
leafsize = 10
eps = .1 # approximate nearest, dist <= (1 + eps) * true nearest
p = 1 # weights ~ 1 / distance**p
cycle = .25
seed = 1
class Invdisttree:
def __init__( self, X, z, leafsize=10, stat=0 ):
assert len(X) == len(z), "len(X) %d != len(z) %d" % (len(X), len(z))
self.tree = KDTree( X, leafsize=leafsize ) # build the tree
self.z = z
self.stat = stat
self.wn = 0
self.wsum = None;
def __call__( self, q, nnear=6, eps=0, p=1, weights=None ):
# nnear nearest neighbours of each query point --
q = np.asarray(q)
qdim = q.ndim
if qdim == 1:
q = np.array([q])
if self.wsum is None:
self.wsum = np.zeros(nnear)
self.distances, self.ix = self.tree.query( q, k=nnear, eps=eps )
interpol = np.zeros( (len(self.distances),) + np.shape(self.z[0]) )
jinterpol = 0
for dist, ix in zip( self.distances, self.ix ):
if nnear == 1:
wz = self.z[ix]
elif dist[0] < 1e-10:
wz = self.z[ix[0]]
else: # weight z s by 1/dist --
w = 1 / dist**p
if weights is not None:
w *= weights[ix] # >= 0
w /= np.sum(w)
wz = np.dot( w, self.z[ix] )
if self.stat:
self.wn += 1
self.wsum += w
interpol[jinterpol] = wz
jinterpol += 1
return interpol if qdim > 1 else interpol[0]
# adapted from a script agrid https://github.com/TobbeTripitaka/agrid
# paper can be accssed from https://www.doi.org/10.5334/JORS.287
verbose = True
def _check_if_in(xx, yy, margin=2):
'''Generate an array of the condition that coordinates
are within the model or not.
xx = list or array of x values
yy = list or array of y values
margin = extra cells added to mitigate extrapolation of
interpolated values along the frame
returns boolean array True for points within the frame.
'''
res = [xx.max()- xx.min(), yy.max(), yy.min() ]
x_min = xx.min() - margin * res[0]
x_max = xx.max() + margin * res[0]
y_min = yy.min() - margin * res[1]
y_max = yy.max() + margin * res[1]
return (xx > x_min) & (xx < x_max) & (yy > y_min) & (yy < y_max)
def _set_meridian( x_array, center_at_0=True):
'''
Sloppy function to change longitude values from [0..360] to [-180..180]
x_array : Numpy array with longitude values (X)
center_at_0 : Bool select direction of conversion.
lon=(((lon + 180) % 360) - 180)
'''
if center_at_0:
x_array[x_array > 180] = x_array[x_array > 180] - 360
else:
x_array[x_array < 0] = x_array[x_array < 0] + 360
return x_array
def read_numpy( i = 0,
j = 1,
k = 2,
data = None,
ds=None,
interpol='linear',
crs_from=None,
crs_to=None,
use_dask=None,
dask_chunks=None,
pad_around=False,
only_frame=True,
set_center=False,
verbose=False,
z_factor=1,
**kwargs):
'''Read numpy array and interpolate to grid.
Keyword arguments:
x,y,z numpy arrays of same size, eg, A[0,:], A[1,:], A[2,:]
Returns numpy array
kwargs to interpolation
'''
if data is not None:
x = data[:,i]
y = data[:,j]
z = data[:,k]
assert(np.shape(x)==np.shape(y)==np.shape(z)), 'x, y, and z must have the same shape.'
if crs_from is None:
crs_from = crs_from
if crs_to is None:
crs_to = crs_to
if verbose:
print('Shape:', np.shape(x))
if z_factor is not 1:
z *= z_factor
# Set longitude, case from 0 to -360 insetad of -180 to 180
if set_center:
x = _set_meridian(x)
transformer = Transformer.from_crs(crs_from, crs_to)
xv, yv = transformer.transform(x, y)
#xv, yv = proj.transform(proj.Proj(crs_src),
# proj.Proj(crs), x, y)
n = z.size
zi = np.reshape(z, (n))
xi = np.reshape(xv, (n))
yi = np.reshape(yv, (n))
# Check and interpolate only elements in the frame
if only_frame:
is_in = _check_if_in(xi, yi)
xi = xi[is_in]
yi = yi[is_in]
zi = zi[is_in]
ny = len(ds.Y)
nx = len(ds.X)
nn = (ny, nx)
if interpol == "IDW":
X = np.array([xv, yv]).T
coords = np.stack([ ds.coords['XV'].values.flatten() ,
ds.coords['YV'].values.flatten() ], axis=1)
invdisttree = Invdisttree( X,
z.reshape(-1,1), leafsize=leafsize, stat=1 )
arr = invdisttree( coords,
nnear=Nnear, eps=eps, p=p )
arr = arr.ravel().reshape(nn)
else:
arr = interpolate.griddata((xi, yi),
zi,
(ds.coords['XV'],
ds.coords['YV']),
method=interpol,
**kwargs)
if pad_around:
for i in range(ep_max)[::-1]:
arr[:, i][np.isnan(arr[:, i])] = arr[
:, i + 1][np.isnan(arr[:, i])]
arr[:, -i][np.isnan(arr[:, -i])] = arr[:, -
i - 1][np.isnan(arr[:, -i])]
arr[i, :][np.isnan(arr[i, :])] = arr[
i + 1, :][np.isnan(arr[i, :])]
arr[-i, :][np.isnan(arr[-i, :])] = arr[-i -
1, :][np.isnan(arr[-i, :])]
if use_dask:
if dask_chunks is None:
dask_chunks = (nx // chunk_n,) * arr.ndim
arr = da.from_array(arr, chunks=dask_chunks)
return arr
# object of pyproj and inistilize it with WGS84 model
geod = proj.Geod(ellps='WGS84' )
km = 1000
# this function get the distance in km and avoid uncessary unpaking
def geodinv(lon1, lat1, lon2, lat2):
_, _, ds_ = geod.inv(lon1, lat1, lon2, lat2)
return ds_/km
# Define function to correct magnetic values
def mag_log(data, C=400, clip_min=-1, clip_max=1):
return np.clip(np.sign(data)*np.log(1+np.abs(data)/C), clip_min, clip_max)