-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #187 from COS301-SE-2024/develop
Merge develop into main
- Loading branch information
Showing
128 changed files
with
39,498 additions
and
699 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
*.safetensors filter=lfs diff=lfs merge=lfs -text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
[flake8] | ||
extend-ignore = W292, W291, E302, W293, E501, E261, E262 | ||
extend-ignore = W292, W291, E302, W293, E501, E261, E262, E402 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,6 @@ name: Node.js CI | |
|
||
on: | ||
push: | ||
pull_request: | ||
|
||
jobs: | ||
build: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,4 +5,8 @@ obj/ | |
**/__pycache__/ | ||
venv/ | ||
myenv/ | ||
|
||
**/results/ | ||
__pycache__/ | ||
.DS_Store | ||
**/.DS_Store | ||
env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
{ | ||
"python.testing.unittestArgs": [ | ||
"-v", | ||
"-s", | ||
"./backend", | ||
"-p", | ||
"*test*.py" | ||
], | ||
"python.testing.pytestEnabled": false, | ||
"python.testing.unittestEnabled": true | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
trained_model/* filter=lfs diff=lfs merge=lfs -text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from transformers import pipeline


class CA:
    """Consent-agreement (CA) classifier.

    Wraps a Hugging Face text-classification pipeline fine-tuned for GDPR
    consent-agreement detection and returns the predicted label string.
    """

    def __init__(self):
        # Inputs are truncated to the model's 512-token context window.
        self.max_length_ = 512
        self.classifier = pipeline("text-classification", model="rdhinaz/gdpr_consent_agreement")
        # Alias kept for backward compatibility with callers that read .max_length.
        self.max_length = self.max_length_

    def run_CA(self, text):
        """Classify *text* and return the predicted label (e.g. 'LABEL_0')."""
        return self.predict(text)

    def predict(self, input_text):
        """Truncate *input_text* to max_length tokens, then classify it.

        The encode/decode round-trip enforces the token limit before the
        pipeline runs, so over-long inputs cannot raise a length error.
        """
        tokens = self.classifier.tokenizer.encode(input_text, truncation=True, max_length=self.max_length_)
        truncated_text = self.classifier.tokenizer.decode(tokens, skip_special_tokens=True)

        result = self.classifier(truncated_text)
        return result[0]['label']


if __name__ == '__main__':
    ca = CA()
    print(ca.run_CA("Sure, you can use my data"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from transformers import pipeline


class EM:
    """Ethnicity-mention (EM) detector.

    Wraps a Hugging Face text-classification pipeline; predict() returns 1
    when the model assigns any label other than 'LABEL_2' (which, per the
    demo cases below, appears to mean "no ethnicity" — TODO confirm against
    the model card), otherwise 0.
    """

    def __init__(self):
        # Inputs are truncated to the model's 512-token context window.
        self.max_length_ = 512
        self.classifier = pipeline("text-classification", model="BananaFish45/Ethnicity_model")
        # Alias kept for backward compatibility with callers that read .max_length.
        self.max_length = self.max_length_

    def run_EM(self, text):
        """Return 1 if *text* mentions an ethnicity, else 0."""
        return self.predict(text)

    def predict(self, input_text):
        """Truncate *input_text* to max_length tokens, classify, and count the hit.

        Replaces the original count/`+= 0` bookkeeping with a direct int()
        conversion — same 0/1 integer result.
        """
        tokens = self.classifier.tokenizer.encode(input_text, truncation=True, max_length=self.max_length_)
        truncated_text = self.classifier.tokenizer.decode(tokens, skip_special_tokens=True)

        result = self.classifier(truncated_text)
        # 'LABEL_2' is the no-ethnicity class; any other label counts as a detection.
        return int(result[0]['label'] != 'LABEL_2')


if __name__ == '__main__':
    em = EM()
    print(em.run_EM("Sure, you can use my data"))  # No ethnicity
    print(em.run_EM("Paul is an Indian"))  # Indian
    print(em.run_EM("Dhinaz is an African young man very strong"))  # African
    print(em.run_EM("Yeshlen is an Asian business man crooked man indeed"))  # Asian
    print(em.run_EM("Yudi is a Middle Eastern Rug business man honest guy hey but loves money"))  # Middle Eastern
    print(em.run_EM("Jonas is a school admin clerk"))  # No Ethnicity
    print(em.run_EM("Micheal is an Caucasian programmer "))  # European
    print(em.run_EM("Nevin is an Hispanic scammer"))  # Hispanic
    print(em.run_EM("Samantha is an African American"))  # African American
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from transformers import pipeline


class GDPR:
    """GDPR personal-data detector.

    Wraps a Hugging Face text-classification pipeline; predict() returns 1
    when the model assigns any label other than 'LABEL_0' (which, per the
    demo cases below, appears to mean "no personal data" — TODO confirm
    against the model card), otherwise 0.
    """

    def __init__(self):
        # Inputs are truncated to the model's 512-token context window.
        self.max_length_ = 512
        self.classifier = pipeline("text-classification", model="BananaFish45/gdpr_personal_data")
        # Alias kept for backward compatibility with callers that read .max_length.
        self.max_length = self.max_length_

    def run_GDPR(self, text):
        """Return 1 if *text* contains personal data, else 0."""
        return self.predict(text)

    def predict(self, input_text):
        """Truncate *input_text* to max_length tokens, classify, and count the hit.

        Replaces the original count/`+= 0` bookkeeping with a direct int()
        conversion — same 0/1 integer result.
        """
        tokens = self.classifier.tokenizer.encode(input_text, truncation=True, max_length=self.max_length_)
        truncated_text = self.classifier.tokenizer.decode(tokens, skip_special_tokens=True)

        result = self.classifier(truncated_text)
        # 'LABEL_0' is the no-personal-data class; any other label counts as a detection.
        return int(result[0]['label'] != 'LABEL_0')


if __name__ == '__main__':
    gdpr = GDPR()
    print(gdpr.run_GDPR("Samantha Schultz lives at 5365 Julie Union, Lake Jorgemouth, PA 90372. Contact them at johnsonjames@valencia-boyer.com or 5020153600.,Samantha,Schultz,S,2020-06-20,5365 Julie Union, Lake Jorgemouth, PA 90372,johnsonjames@valencia-boyer.com,5020153600,+1-762-972-3354,042-45-5223,XD20386955,445-92-8929,950-96-6247"))  # Personal Data
    print(gdpr.run_GDPR("Food life marriage book if business. Long beat accept tell. Support method field agency several break citizen amount."))  # No personal data
    print(gdpr.run_GDPR("John Doe's email is john.doe@example.com and his phone number is 123-456-7890."))  # Personal data
    print(gdpr.run_GDPR("Yeshlen Moddley is an Asian business man crooked man indeed"))  # Personal data
    print(gdpr.run_GDPR("This document contains general information about the company's financials"))  # No personal data
    print(gdpr.run_GDPR("Jonas Motha is a school admin clerk"))  # No Ethnicity
    print(gdpr.run_GDPR("Micheal Simon is an Caucasian programmer "))  # Personal data
    print(gdpr.run_GDPR("Nevin Thomas is an Hispanic scammer"))  # Personal data
    print(gdpr.run_GDPR("Samantha Williams is an African American"))  # Personal data
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


class MD:
    """Medical-disease (MD) extractor built on the BioBERT NCBI-disease NER model."""

    def __init__(self):
        # NOTE: parameter name `input` below shadows the builtin but is kept —
        # it is part of the caller-visible interface.
        tokenizer = AutoTokenizer.from_pretrained("ugaray96/biobert_ncbi_disease_ner")
        model = AutoModelForTokenClassification.from_pretrained(
            "ugaray96/biobert_ncbi_disease_ner"
        )
        self.ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

    def run_MD(self, input):
        """Return the list of disease mentions detected in *input*."""
        return self.predict(input)

    def predict(self, input):
        """Run NER over *input* and merge multi-token disease mentions.

        Entities tagged 'Disease' start a new mention; entities tagged
        'Disease Continuation' are appended (space-separated) to the most
        recent mention, provided one already exists.
        """
        entities = self.ner_pipeline(input)

        diseases = []
        for ent in entities:
            tag = ent["entity"]
            if tag == "Disease":
                diseases.append(ent["word"])
            elif tag == "Disease Continuation" and diseases:
                # Continuation tokens extend the previous disease mention.
                diseases[-1] = f"{diseases[-1]} {ent['word']}"

        return diseases


if __name__ == '__main__':
    md = MD()
    output = md.run_MD("Personal Information: Name: Harvey Spectre Date of Birth: July 18, 1972 Address: 123 Pearson Street, New York, NY Email Address: harvey.spectre@example.com Phone Number: (555) 555-1234 National Identification Number: 123-45-6789 IP Address: 192.168.1.100 Social Media Profile: @HarveySpectreLaw (Twitter) Special Categories of Personal Data: Health Data: Harvey Spectre's medical records detail his health history, including treatment for a sports-related injury sustained during his college years and regular check-ups for managing hypertension. Medication records show prescriptions for blood pressure management and occasional pain relief medication.")
    print(len(output))
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import spacy
import os

# NOTE(review): base_dir/model_path are computed but never used — the code
# loads the stock en_core_web_sm model below. Kept because other modules may
# import these names; confirm before deleting.
base_dir = os.path.dirname(__file__)
model_path = os.path.join(base_dir, 'Entity_builder')

# Loaded once at import time so repeated run_NER calls reuse the pipeline.
model = spacy.load("en_core_web_sm")


class NER:
    """Thin wrapper around spaCy NER returning (text, label) pairs."""

    def extract_entities(self, res):
        """Return [(entity_text, entity_label), ...] from a processed spaCy Doc."""
        return [(ent.text, ent.label_) for ent in res.ents]

    def run_NER(self, text):
        """Run the spaCy pipeline on *text* and return its named entities."""
        res = model(text)
        return self.extract_entities(res)


if __name__ == '__main__':
    ner_ = NER()
    x = input("Enter a sentence: ")
    res = ner_.run_NER(x)
    print(res)
Empty file.
Oops, something went wrong.