Rewritten model
lyriccoder committed Jun 15, 2020
1 parent 8ec3e6d commit 11e49e5
Showing 2 changed files with 48 additions and 62 deletions.
aibolit/__main__.py (11 changes: 5 additions & 6 deletions)
@@ -66,13 +66,12 @@ def list_dir(path, files):

 def predict(input_params, model, args):
     features_order = model.features_conf['features_order']
-    # load model
-    input = [input_params[i] for i in features_order]
+    # add ncss to last column. We will normalize all patterns by that value
+    input = [input_params[i] for i in features_order] + [input_params['M2']]
     th = float(args.threshold) or 1.0
-    # preds, importances = model.predict(np.array(input))
-    preds, importances = model.informative(np.array(input))
+    preds, importances = model.informative(np.array(input), th=th)

-    return {features_order[int(x)]: int(x) for x in preds}, importances
+    return {features_order[int(x)]: int(x) for x in preds[0]}, importances


def run_parse_args(commands_dict):
@@ -217,7 +216,7 @@ def inference(
         code_lines = code_lines_dict.get('lines_' + key)
         importance = importances[iter]
         # We show only patterns with positive importance
-        if code_lines and importance > 0.00000e-20:
+        if code_lines and importance > 0:
             if code_lines:
                 pattern_name = \
                     [x['name'] for x in Config.get_patterns_config()['patterns']
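
For context on the new call flow: predict() now appends ncss (pattern 'M2') as the last element of the feature vector, passes the user-supplied threshold (args.threshold) down to Model.informative(), and unwraps preds[0] because informative() now returns a ranking per snippet. Below is a minimal, self-contained sketch of that flow; FakeModel, its toy ranking rule, and the threshold handling are illustrative assumptions, not aibolit's trained model:

import numpy as np


class FakeModel:
    """Stand-in for aibolit's trained model; informative() is assumed to return
    (ranked pattern indices, importances) as in the diff above."""

    features_conf = {'features_order': ['P1', 'P2', 'P3']}

    def informative(self, x, th=1.0):
        x = np.expand_dims(x, axis=0)
        # toy ranking: order patterns by their (non-ncss) counts, descending
        order = np.argsort(-x[0][:-1])
        return [list(order)], [float(v) for v in x[0][:-1]]


def predict(input_params, model, threshold=None):
    features_order = model.features_conf['features_order']
    # ncss ('M2') goes into the last column; informative() normalizes by it
    vector = [input_params[i] for i in features_order] + [input_params['M2']]
    th = float(threshold) if threshold is not None else 1.0
    preds, importances = model.informative(np.array(vector), th=th)
    return {features_order[int(x)]: int(x) for x in preds[0]}, importances


print(predict({'P1': 2, 'P2': 0, 'P3': 5, 'M2': 40}, FakeModel(), threshold=0.8))
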
aibolit/model/model.py (99 changes: 43 additions & 56 deletions)
@@ -1,3 +1,4 @@
+from decimal import localcontext, ROUND_DOWN, Decimal
 from typing import List

 import numpy as np
@@ -98,11 +99,13 @@ def fit(self, X, y, display=False):
         self.model = model
         self.model.fit(X, y.ravel(), logging_level='Silent')

-    def __get_pairs(self, item, th: float):
+    def __get_pairs(self, item, th: float, feature_importances=None):
         def sigmoid(x):
             return 1 / (1 + np.exp(-x))

-        pattern_importances = item * self.model.feature_importances_
+        if not feature_importances:
+            feature_importances = self.model.feature_importances_
+        pattern_importances = item * feature_importances
         # mask discards not significant patterns
         th_mask = (sigmoid(pattern_importances) <= th) + 0
         pattern_importances = pattern_importances * th_mask
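
To make the reworked __get_pairs() concrete: each pattern occurrence count is weighted by a feature importance (CatBoost's own importances, or the ones passed in by informative()), and a sigmoid mask zeroes out scores above the sensitivity th. A toy sketch of that masking step follows; the numbers, the default importances, and the final (scores, indices) pairing are assumptions, since the rest of the method is not shown in this hunk:

import numpy as np


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def get_pairs(item, th, feature_importances=None):
    if not feature_importances:
        feature_importances = [0.5, 0.1, 0.9, 0.3]   # made-up default importances
    # weight pattern occurrences by importance
    pattern_importances = item * np.asarray(feature_importances)
    # mask discards not significant patterns
    th_mask = (sigmoid(pattern_importances) <= th) + 0
    pattern_importances = pattern_importances * th_mask
    # assumed pairing of scores with pattern indices (the real method continues here)
    return pattern_importances, np.arange(len(item))


scores, idx = get_pairs(np.array([0.0, 2.0, 1.0, 4.0]), th=0.7)
print(scores, idx)
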
@@ -112,47 +115,7 @@ def sigmoid(x):
     def __vstack_arrays(self, res):
         return np.vstack(res).T

-    def predict1(self, X, return_acts=False, quantity_func='log', th=1.0):
-        """
-        Args:
-            X: np.array with shape (number of snippets, number of patterns) or
-                (number of patterns, ).
-            quantity_func: str, type of function that will be applied to
-                number of occurrences.
-            th (float): Sensitivity of algorithm to recommend.
-                0 - ignore all recomendations
-                1 - use all recommendations
-        Returns:
-            ranked: np.array with shape (number of snippets, number of patterns)
-                of sorted patterns in non-increasing order for each snippet of
-                code.
-        """
-
-        if X.ndim == 1:
-            X = X.copy()
-            X = np.expand_dims(X, axis=0)
-
-        ranked = []
-        quantity_funcs = {
-            'log': lambda x: np.log1p(x) / np.log(10),
-            'exp': lambda x: np.exp(x + 1),
-            'linear': lambda x: x,
-        }
-
-        for snippet in X:
-            try:
-                item = quantity_funcs[quantity_func](snippet)
-                pairs = self.__vstack_arrays(self.__get_pairs(item, th))
-                pairs = pairs[pairs[:, 0].argsort()]
-                ranked.append(pairs[:, 1].T.tolist()[::-1])
-            except Exception:
-                raise Exception("Unknown func")
-
-        if not return_acts:
-            return (np.array(ranked), pairs[:, 0].T.tolist()[::-1])
-        return np.array(ranked), pairs[:, 0].T.tolist()[::-1], np.zeros(X.shape[0]) - 1
-
-    def predict(self, X, quantity_func='log', th=1.0):
+    def calculate_score(self, X, quantity_func='log', th=1.0, feature_importances=None):
         """
         Args:
             X: np.array with shape (number of snippets, number of patterns) or
@@ -185,15 +148,16 @@ def predict(self, X, quantity_func='log', th=1.0):
         for snippet in X:
             try:
                 item = quantity_funcs[quantity_func](snippet)
-                pairs = self.__vstack_arrays(self.__get_pairs(item, th))
+                pairs = self.__vstack_arrays(self.__get_pairs(item, th, feature_importances))
                 pairs = pairs[pairs[:, 0].argsort()]
                 ranked.append(pairs[:, 1].T.tolist()[::-1])
-            except Exception:
+            except Exception as e:
+                import traceback
+                traceback.print_exc()
                 raise Exception("Unknown func")

         return (np.array(ranked), pairs[:, 0].T.tolist()[::-1])

-
     def get_array(self, X, mask, i, incr):
         """
         Args:
@@ -206,7 +170,7 @@ def get_array(self, X, mask, i, incr):
"""

X1 = X.copy()
X1[:, i][mask[:, i]] += incr
X1[:, i][mask[:, i]] += incr[mask[:, i]]

return X1

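The small change to get_array() matters because incr is no longer a scalar: informative() below passes -1 / ncss, i.e. one decrement per snippet, so only the masked rows should receive their own element of incr. A toy illustration with invented numbers:

import numpy as np


def get_array(X, mask, i, incr):
    # decrement column i only where the pattern occurs; incr may be a per-row array
    X1 = X.copy()
    X1[:, i][mask[:, i]] += incr[mask[:, i]]
    return X1


X = np.array([[2.0, 0.0], [3.0, 1.0]])   # two snippets, two patterns (toy data)
mask = X > 0
ncss = np.array([10.0, 20.0])
# each snippet loses one occurrence of pattern 0, scaled by its own ncss
print(get_array(X, mask, 0, -1 / ncss))
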
@@ -225,7 +189,7 @@ def get_minimum(self, c1, c2, c3):

         return np.min(c, 0), np.argmin(c, 0)

-    def informative(self, X):
+    def informative(self, X, scale=True, th=1.0):
         """
         Args:
             X: np.array with shape (number of snippets, number of patterns) or
@@ -240,17 +204,40 @@
"""
X = X.copy()
X = np.expand_dims(X, axis=0)
# remember it, since we will use `log` function for non-normalized input value
X_old = X[:, :-1]

ncss = X[:, -1]
if scale:
X = X[:, :-1] / ncss.reshape(-1, 1)
else:
X = X[:, :-1]

k = X.shape[1]
complexity = self.model.predict(X)
mask = X > 0
importances = []
print(f' complexity: {complexity}')
for i in range(k):
complexity_minus = self.model.predict(self.get_array(X, mask, i, -1))
diff = complexity - complexity_minus
if i == 11:
print()
importances.append((i, diff[0]))

sorted_arr = sorted(importances, key=lambda x: x[1], reverse=True)
return [x[0] for x in sorted_arr], [x[1] for x in sorted_arr]
if X[0][i] == 0:
# do not need to predict if we have 0
importances.append((i, 0))
continue
complexity_minus = self.model.predict(self.get_array(X, mask, i, -1 / ncss))
if complexity_minus < complexity:
# complexity decreased
with localcontext() as ctx:
ctx.rounding = ROUND_DOWN
abs_diff = abs(complexity - complexity_minus)[0]
diff = Decimal(abs_diff).quantize(Decimal('0.001'))
diff = float(diff * 100)
elif complexity_minus > complexity:
# complexity increased, we do not need such value, set to 0,
# cause we need only patterns when complexity decreased
diff = 0
else:
diff = 0
importances.append((i, diff))

only_importances = list(np.array(importances).T[1])
return self.calculate_score(X_old[0], th=th, feature_importances=only_importances)
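
Taken together, the rewritten informative() normalizes pattern counts by ncss, removes one occurrence of each present pattern in turn, keeps only the cases where the predicted complexity drops (rounding the drop down to three decimals and scaling by 100), and finally hands those importances to calculate_score() for ranking. The following is a condensed, self-contained approximation of that leave-one-pattern-out loop, with a trivial stand-in for the trained CatBoost regressor and without the final ranking step:

from decimal import Decimal, ROUND_DOWN, localcontext

import numpy as np


def pattern_importances(predict, row_with_ncss):
    # last column is ncss; pattern counts are normalized by it before prediction
    ncss = row_with_ncss[-1]
    x = row_with_ncss[:-1] / ncss
    base = predict(x)
    importances = []
    for i in range(len(x)):
        if x[i] == 0:
            importances.append(0.0)      # pattern absent, nothing to remove
            continue
        x_minus = x.copy()
        x_minus[i] -= 1 / ncss           # drop one occurrence of pattern i
        lowered = predict(x_minus)
        if lowered < base:
            with localcontext() as ctx:
                ctx.rounding = ROUND_DOWN
                diff = Decimal(abs(base - lowered)).quantize(Decimal('0.001'))
            importances.append(float(diff * 100))
        else:
            importances.append(0.0)      # complexity did not decrease
    return importances


# toy "model": complexity grows with the weighted sum of normalized pattern counts
weights = np.array([4.0, 0.0, 2.5])
print(pattern_importances(lambda x: float(weights @ x),
                          np.array([3.0, 1.0, 2.0, 50.0])))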
