Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(agent): Add Okapi BM25 agent #101

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ Get the help by running `atarashi -h` or `atarashi --help`
- Running **wordFrequencySimilarity** agent

`atarashi -a wordFrequencySimilarity /path/to/file.c`
- Running **bm25** agent

`atarashi -a bm25 /path/to/file.c`
- Running **tfidf** agent
- With **Cosine similarity**

Expand Down
128 changes: 128 additions & 0 deletions atarashi/agents/bm25.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Copyright 2022 Sushant kumar (sushantmishra02102002@gmail.com)

SPDX-License-Identifier: GPL-2.0

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""

import argparse

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

from atarashi.agents.atarashiAgent import AtarashiAgent
from atarashi.libs.initialmatch import spdx_identifer

__author__ = "Sushant Kumar"
__email__ = "sushantmishra02102002@gmail.com"


class Bm25(AtarashiAgent):
    '''
    Agent that ranks the entries of the license list against a scanned file
    with the Okapi BM25 ranking function.

    The preprocessed file content acts as the query and each license's
    ``processed_text`` acts as a document.

    Reference: https://en.wikipedia.org/wiki/Okapi_BM25
    '''

    def __init__(self, licenseList, b=0.59, k1=1.6):
        '''
        :param licenseList: Passed unchanged to ``AtarashiAgent.__init__``
        :param b: BM25 document-length normalisation weight
        :param k1: BM25 term-frequency saturation parameter
        '''
        super().__init__(licenseList)
        # With smooth_idf=False sklearn computes idf = ln(n / df) + 1, which
        # is why fit_transform() subtracts 1 to obtain the plain ln(n / df).
        self.vectorizer = TfidfVectorizer(smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit_transform(self, processedData, corpus):
        '''
        Perform the Okapi BM25 transformation of both processedData and the
        license corpus using sklearn's TfidfVectorizer.

        Nothing is returned: the numerator and denominator of the BM25 term
        weights are stored in ``self.dividend`` and ``self.divisor`` and are
        combined by ``scores()``.

        Reference: https://en.wikipedia.org/wiki/Okapi_BM25

        :param processedData: Preprocessed input file
        :param corpus: List of licenses from licenseList
        '''
        b, k1 = self.b, self.k1

        # Learn the vocabulary and the idf weights from the license texts.
        self.vectorizer.fit(corpus)
        # Calling transform() on the parent of TfidfVectorizer skips the
        # tf-idf weighting and yields the raw term-count matrix.
        corpus = super(TfidfVectorizer, self.vectorizer).transform(corpus)

        # Per-license length (total term count) and the average length.
        len_X = corpus.sum(1).A1
        avdl = len_X.mean()

        # Term counts of the scanned text over the same vocabulary.
        processedData = super(
            TfidfVectorizer, self.vectorizer).transform([processedData])

        # Only terms that occur in the scanned text contribute to the score,
        # so keep just those columns.
        corpus = corpus.tocsc()[:, processedData.indices]
        self.divisor = corpus + (k1 * (1 - b + b * len_X / avdl))[:, None]

        idf = self.vectorizer.idf_[None, processedData.indices]
        self.dividend = corpus.multiply(
            np.broadcast_to(idf - 1, corpus.shape)) * (k1 + 1)

    def scores(self):
        '''
        Combine the terms prepared by ``fit_transform()`` into BM25 scores.

        ``fit_transform()`` must have been called first, because it sets
        ``self.dividend`` and ``self.divisor``.

        :return: 1-D array with one score per license, in corpus order
        '''
        return (self.dividend / self.divisor).sum(1).A1

    def scan(self, filePath):
        '''
        Read the content of filename, extract the comments and preprocess them.
        Find the license of the preprocessed file.

        SPDX identifiers found in the raw file are returned as-is. Otherwise
        up to five licenses with the highest BM25 scores are returned.

        :param filePath: Path of the file to scan
        :return: List of matches; BM25 matches are dicts with the keys
                 ``shortname``, ``sim_score``, ``sim_type`` and ``description``
        '''
        processedData = super().loadFile(filePath)

        with open(filePath) as file:
            raw_data = file.read()
        spdx_identifers = spdx_identifer(raw_data,
                                         self.licenseList['shortname'])

        # An explicit SPDX identifier wins; no similarity ranking is needed.
        if spdx_identifers:
            return list(spdx_identifers)

        # Read both columns in one pass instead of one .iloc row lookup per
        # license.
        corpus = self.licenseList['processed_text'].tolist()
        corpus_identifier = self.licenseList['shortname'].tolist()

        self.fit_transform(processedData, corpus)
        doc_scores = self.scores()

        match = []
        # Best five candidates, highest score first.
        for index in np.argsort(doc_scores)[::-1][:5]:
            # ln(n / df) is never negative, so a score of 0 means the license
            # shares no informative term with the file. Scores are sorted in
            # descending order, so nothing after this point can match either.
            if doc_scores[index] <= 0:
                break
            match.append({
                "shortname": str(corpus_identifier[index]),
                "sim_score": doc_scores[index],
                "sim_type": "bm25",
                "description": ""
            })

        return match


if __name__ == "__main__":
    # Stand-alone entry point: scan a single file with the BM25 agent and
    # print the matches.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "inputFile", help="Specify the input file which needs to be scanned")
    parser.add_argument("processedLicenseList",
                        help="Specify the processed license list file which contains licenses")
    args = parser.parse_args()
    filename = args.inputFile
    licenseList = args.processedLicenseList
    # args.verbose is intentionally not read: no --verbose option is
    # registered on this parser (reading it raised AttributeError) and
    # Bm25.__init__ takes no verbosity argument.

    scanner = Bm25(licenseList)
    # Run the scan so the parsed inputFile is actually used.
    print("License Detected using BM25: " + str(scanner.scan(filename)))
5 changes: 4 additions & 1 deletion atarashi/atarashii.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from atarashi.agents.dameruLevenDist import DameruLevenDist
from atarashi.agents.tfidf import TFIDF
from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
from atarashi.agents.bm25 import Bm25

__author__ = "Aman Jain"
__email__ = "amanjain5221@gmail.com"
Expand Down Expand Up @@ -78,6 +79,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim",
scanner = WordFrequencySimilarity(processedLicense)
elif agent_name == "DLD":
scanner = DameruLevenDist(processedLicense)
elif agent_name == "bm25":
scanner = Bm25(processedLicense)
elif agent_name == "tfidf":
scanner = TFIDF(processedLicense)
if similarity == "CosineSim":
Expand Down Expand Up @@ -128,7 +131,7 @@ def main():
parser.add_argument("-l", "--processedLicenseList", required=False,
help="Specify the location of processed license list file")
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'bm25'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
Expand Down
2 changes: 1 addition & 1 deletion atarashi/evaluator/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def evaluate(scanner):
defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'bm25'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ spacy>=2.0.11
textdistance>=3.0.3
setuptools>=39.2.0
nirjas>=0.0.5
urllib3>=1.24.1
urllib3>=1.24.1