fossology · its-sushant · Jul 13, 2022
diff --git a/README.md b/README.md
@@ -60,6 +60,9 @@ Get the help by running `atarashi -h` or `atarashi --help`
 - Running **wordFrequencySimilarity** agent
 
     `atarashi -a wordFrequencySimilarity /path/to/file.c`
+- Running **bm25** agent
+
+    `atarashi -a bm25 /path/to/file.c`
 - Running **tfidf** agent
     - With **Cosine similarity**
 

diff --git a/atarashi/agents/bm25.py b/atarashi/agents/bm25.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2022 Sushant kumar (sushantmishra02102002@gmail.com)
+
+SPDX-License-Identifier: GPL-2.0
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+version 2 as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""
+
+import argparse
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+
+from atarashi.agents.atarashiAgent import AtarashiAgent
+from atarashi.libs.initialmatch import spdx_identifer
+
+__author__ = "Sushant Kumar"
+__email__ = "sushantmishra02102002@gmail.com"
+
+
+class Bm25(AtarashiAgent):
+    '''Agent that ranks licenses against a file using Okapi BM25 scores.'''
+
+    def __init__(self, licenseList, b=0.59, k1=1.6):
+        '''
+        :param licenseList: License list passed on to AtarashiAgent
+        :param b: BM25 document length normalisation weight
+        :param k1: BM25 term frequency saturation parameter
+        '''
+        super().__init__(licenseList)
+        self.vectorizer = TfidfVectorizer(smooth_idf=False)
+        self.b = b
+        self.k1 = k1
+
+    def fit_transform(self, processedData, corpus):
+        '''
+        Build the Okapi BM25 dividend and divisor matrices that score
+        processedData against every license in corpus, using sklearn's
+        TfidfVectorizer for the term counts and idf weights.
+
+        Reference: https://en.wikipedia.org/wiki/Okapi_BM25
+
+        :param processedData: Preprocessed input file
+        :param corpus: List of licenses from licenseList
+        '''
+        b, k1 = self.b, self.k1
+
+        # Fit on the licenses, then call the parent class transform to get
+        # plain term counts instead of tf-idf weights.
+        self.vectorizer.fit(corpus)
+        corpus = super(TfidfVectorizer, self.vectorizer).transform(corpus)
+
+        # Per-license length (row sum of counts) and the average length.
+        avdl = corpus.sum(1).mean()
+        len_X = corpus.sum(1).A1
+
+        # transforming processeddata into vector for similarity calc.
+        processedData = super(
+            TfidfVectorizer, self.vectorizer).transform([processedData])
+
+        # Restrict the licenses to the terms present in the scanned file.
+        corpus = corpus.tocsc()[:, processedData.indices]
+        self.divisor = corpus + (k1 * (1 - b + b * len_X / avdl))[:, None]
+
+        # sklearn adds 1 to every idf value; subtract it to get the raw idf.
+        idf = self.vectorizer._tfidf.idf_[None, processedData.indices]
+        self.dividend = corpus.multiply(
+            np.broadcast_to(idf-1, corpus.shape)) * (k1 + 1)
+
+    def scores(self):
+        '''Return one BM25 score per license; run fit_transform first.'''
+        return (self.dividend / self.divisor).sum(1).A1
+
+    def scan(self, filePath):
+        '''
+        Read the file at filePath, extract the comments and preprocess them.
+        Find the license of the preprocessed file.
+
+        :param filePath: Path of the file to scan
+        :return: SPDX identifier matches if any, else the top five BM25 matches
+        '''
+        processedData = super().loadFile(filePath)
+
+        with open(filePath) as file:
+            raw_data = file.read()
+        spdx_matches = spdx_identifer(raw_data, self.licenseList['shortname'])
+        if spdx_matches:
+            return list(spdx_matches)
+
+        # Read whole columns instead of one .iloc row lookup per license.
+        corpus = self.licenseList['processed_text'].tolist()
+        corpus_identifier = self.licenseList['shortname'].tolist()
+
+        self.fit_transform(processedData, corpus)
+        doc_scores = self.scores()
+        return [{
+            "shortname": str(corpus_identifier[index]),
+            "sim_score": doc_scores[index],
+            "sim_type": "bm25",
+            "description": ""
+        } for index in np.argsort(doc_scores)[::-1][:5]]
+
+
+if __name__ == "__main__":
+    # Stand-alone entry point: scan one file against a processed license list.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "inputFile", help="Specify the input file which needs to be scanned")
+    parser.add_argument("processedLicenseList",
+                        help="Specify the processed license list file which contains licenses")
+    args = parser.parse_args()
+
+    # The parser declares no --verbose flag, so args.verbose is not read here.
+    scanner = Bm25(args.processedLicenseList)
+    print(scanner.scan(args.inputFile))
diff --git a/atarashi/atarashii.py b/atarashi/atarashii.py
@@ -28,6 +28,7 @@
 from atarashi.agents.dameruLevenDist import DameruLevenDist
 from atarashi.agents.tfidf import TFIDF
 from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
+from atarashi.agents.bm25 import Bm25
 
 __author__ = "Aman Jain"
 __email__ = "amanjain5221@gmail.com"
@@ -78,6 +79,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim",
     scanner = WordFrequencySimilarity(processedLicense)
   elif agent_name == "DLD":
     scanner = DameruLevenDist(processedLicense)
+  elif agent_name == "bm25":
+    scanner = Bm25(processedLicense)
   elif agent_name == "tfidf":
     scanner = TFIDF(processedLicense)
     if similarity == "CosineSim":
@@ -128,7 +131,7 @@ def main():
   parser.add_argument("-l", "--processedLicenseList", required=False,
                       help="Specify the location of processed license list file")
   parser.add_argument("-a", "--agent_name", required=True,
-                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
+                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'bm25'],
                       help="Name of the agent that needs to be run")
   parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
                       choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],

diff --git a/atarashi/evaluator/evaluator.py b/atarashi/evaluator/evaluator.py
@@ -118,7 +118,7 @@ def evaluate(scanner):
   defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
   parser = argparse.ArgumentParser()
   parser.add_argument("-a", "--agent_name", required=True,
-                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
+                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'bm25'],
                       help="Name of the agent that needs to be run")
   parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
                       choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],

diff --git a/requirements.txt b/requirements.txt
@@ -7,4 +7,4 @@ spacy>=2.0.11
 textdistance>=3.0.3
 setuptools>=39.2.0
 nirjas>=0.0.5
-urllib3>=1.24.1
+urllib3>=1.24.1