Merge pull request #187 from COS301-SE-2024/develop
Merge develop into main
Yudi-G authored Aug 12, 2024
2 parents eb4f6f8 + 3e89189 commit 7edc4f5
Showing 128 changed files with 39,498 additions and 699 deletions.
Binary file added .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.safetensors filter=lfs diff=lfs merge=lfs -text
2 changes: 1 addition & 1 deletion .github/configs/.flake8
@@ -1,2 +1,2 @@
 [flake8]
-extend-ignore = W292, W291, E302, W293, E501, E261, E262
+extend-ignore = W292, W291, E302, W293, E501, E261, E262, E402
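The newly ignored E402 is flake8's "module level import not at top of file" check. A minimal sketch of the pattern this change permits (the `./backend` path insertion is a hypothetical example, not code from this commit):

```python
import sys

# An executable statement before an import: flake8 would normally flag the
# import below as E402; with E402 in extend-ignore it passes. This pattern
# is common when sys.path must be patched before importing project modules.
sys.path.insert(0, "./backend")  # hypothetical path for illustration

import os  # would be reported as E402 without the config change

print(os.path.basename("backend/app.py"))
```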
1 change: 0 additions & 1 deletion .github/workflows/frontendTests.yml
@@ -5,7 +5,6 @@ name: Node.js CI

 on:
   push:
-  pull_request:

 jobs:
   build:
3 changes: 2 additions & 1 deletion .github/workflows/python-app-test.yml
@@ -7,7 +7,6 @@ name: Python application

 on:
   push:
-  pull_request:


 permissions:
@@ -52,6 +51,7 @@ jobs:
           python -c "import nltk; nltk.download('punkt')"
       - name: Install dependencies
         run: |
+          sudo apt update
           sudo apt-get install poppler-utils
           sudo apt-get install tesseract-ocr
           python -m pip install --upgrade pip
@@ -63,6 +63,7 @@ jobs:
         run: |
           ls -a
           cd backend/Document_parser
+          # cd backend
           pytest --cov=. --cov-report=xml
       - name: Integration test
6 changes: 5 additions & 1 deletion .gitignore
@@ -5,4 +5,8 @@ obj/
 **/__pycache__/
 venv/
 myenv/
-
+**/results/
+__pycache__/
+.DS_Store
+**/.DS_Store
+env
11 changes: 11 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,11 @@
{
    "python.testing.unittestArgs": [
        "-v",
        "-s",
        "./backend",
        "-p",
        "*test*.py"
    ],
    "python.testing.pytestEnabled": false,
    "python.testing.unittestEnabled": true
}
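These settings configure VS Code's unittest discovery. The same discovery can be sketched in plain Python, where `start_dir` and `pattern` mirror the `-s` and `-p` arguments above (using `.` here so the sketch runs anywhere; the repo itself points at `./backend`):

```python
import unittest

loader = unittest.TestLoader()
# Equivalent to: python -m unittest discover -v -s . -p "*test*.py"
suite = loader.discover(start_dir=".", pattern="*test*.py")

# Number of test cases found under the start directory
print(suite.countTestCases())
```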
19 changes: 16 additions & 3 deletions README.md
@@ -30,18 +30,31 @@ The GDPR Data Noncompliance Detector is a software tool designed to identify ins


### Demo 3
[Demo 3 SRS Documentation](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/develop/documentation/Demo_3___SRS.pdf)
<br></br>
[Demo 3 Video Presentation](https://drive.google.com/file/d/1MiGo9iybk75pbj5HlVkt20Jh9sM_36tb/view?usp=drive_link)
<br></br>

### Demo 4

# Documentation

## Software Requirements Specification
-[SRS Document](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/develop/documentation/Demo_2___SRS.pdf)
+[SRS Document](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/develop/documentation/Demo_3___SRS.pdf)

## Architectural Document
-[Architectural Document](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/develop/documentation/Demo_2___Architectural_Document.pdf)
+[Architectural Document](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/develop/documentation/Demo_3___Architectural_Document-1.pdf)
<br></br>
-[Architectural Diagram](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/develop/documentation/Architectural%20Document.png)
+[Architectural Diagram](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/develop/documentation/ArchiV2.png)

## Technical Installation Manual
[Technical Installation](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/develop/documentation/Technical_Installation_Manual-2.pdf)

## Coding Standards
[Coding Standards](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/develop/documentation/Coding_Standards.pdf)

## User Manual
[GND User Manual](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/develop/documentation/GND%20Manual.pdf)

## Project Management Tools
GitHub Issues and GitHub Boards
Binary file added backend/.DS_Store
Binary file not shown.
Binary file added backend/Detection_Engine/.DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions backend/Detection_Engine/.gitattributes
@@ -0,0 +1 @@
trained_model/* filter=lfs diff=lfs merge=lfs -text
32 changes: 32 additions & 0 deletions backend/Detection_Engine/CA_model_access.py
@@ -0,0 +1,32 @@
import os
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# base_dir = os.path.dirname(__file__)
# model_path = os.path.join(base_dir, 'trained_model')
# tokenizer_path = os.path.join(base_dir, 'ca_tokenizer')

# tokenizer = AutoTokenizer.from_pretrained("rdhinaz/gdpr_consent_agreement")
# model = AutoModelForSequenceClassification.from_pretrained("rdhinaz/gdpr_consent_agreement")

class CA:

    def __init__(self):
        # Inputs are truncated to the model's 512-token context window
        self.max_length_ = 512
        self.classifier = pipeline("text-classification", model="rdhinaz/gdpr_consent_agreement")
        self.max_length = self.max_length_

    def run_CA(self, text):
        return self.predict(text)

    def predict(self, input_text):
        # Encode with truncation, then decode back to text so the pipeline
        # never receives more tokens than the model accepts
        tokens = self.classifier.tokenizer.encode(input_text, truncation=True, max_length=self.max_length_)
        truncated_text = self.classifier.tokenizer.decode(tokens, skip_special_tokens=True)

        result = self.classifier(truncated_text)
        return result[0]['label']


if __name__ == '__main__':
    ca = CA()
    print(ca.run_CA("Sure, you can use my data"))
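`CA.predict` encodes, truncates, and decodes the input before classifying so the pipeline never exceeds the model's context window. A toy sketch of that truncation step, with a whitespace split standing in for the Hugging Face tokenizer so no model download is needed (`truncate_text` is a hypothetical name, not code from this commit):

```python
# Toy version of the truncate-then-classify preprocessing in CA.predict:
# str.split stands in for tokenizer.encode, " ".join for tokenizer.decode.
def truncate_text(text, max_length=512):
    tokens = text.split()            # ~ tokenizer.encode(input_text, ...)
    tokens = tokens[:max_length]     # ~ truncation=True, max_length=512
    return " ".join(tokens)          # ~ tokenizer.decode(tokens, ...)

long_text = "data " * 1000           # 1000 "tokens", well over the limit
print(len(truncate_text(long_text).split()))  # 512
```

The real tokenizer splits on subwords rather than whitespace, so actual token counts differ, but the encode-truncate-decode shape is the same.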
41 changes: 41 additions & 0 deletions backend/Detection_Engine/EM_model_access.py
@@ -0,0 +1,41 @@
import os
from transformers import BertTokenizer, BertForSequenceClassification, pipeline


class EM:

    def __init__(self):
        # Inputs are truncated to the model's 512-token context window
        self.max_length_ = 512
        self.classifier = pipeline("text-classification", model="BananaFish45/Ethnicity_model")
        self.max_length = self.max_length_

    def run_EM(self, text):
        return self.predict(text)

    def predict(self, input_text):
        tokens = self.classifier.tokenizer.encode(input_text, truncation=True, max_length=self.max_length_)
        truncated_text = self.classifier.tokenizer.decode(tokens, skip_special_tokens=True)

        result = self.classifier(truncated_text)
        # LABEL_2 means "no ethnicity detected"; any other label counts as one hit
        label = result[0]['label']
        return 0 if label == 'LABEL_2' else 1


if __name__ == '__main__':
    em = EM()
    print(em.run_EM("Sure, you can use my data"))  # No ethnicity
    print(em.run_EM("Paul is an Indian"))  # Indian
    print(em.run_EM("Dhinaz is an African young man very strong"))  # African
    print(em.run_EM("Yeshlen is an Asian business man crooked man indeed"))  # Asian
    print(em.run_EM("Yudi is a Middle Eastern Rug business man honest guy hey but loves money"))  # Middle Eastern
    print(em.run_EM("Jonas is a school admin clerk"))  # No ethnicity
    print(em.run_EM("Micheal is an Caucasian programmer "))  # European
    print(em.run_EM("Nevin is an Hispanic scammer"))  # Hispanic
    print(em.run_EM("Samantha is an African American"))  # African American
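`EM.predict` collapses the classifier's label into a 0/1 hit count, with `LABEL_2` acting as the negative "no ethnicity" class. That mapping can be isolated and checked without loading the model (`label_to_count` is a hypothetical helper name for illustration):

```python
# Sketch of the label-to-count reduction used by EM.predict (and, with
# LABEL_0 as the negative class, by GDPR.predict in this same commit).
def label_to_count(label, negative_label="LABEL_2"):
    # The negative class contributes nothing; every other label is one hit
    return 0 if label == negative_label else 1

print(label_to_count("LABEL_2"))  # 0
print(label_to_count("LABEL_0"))  # 1
```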
43 changes: 43 additions & 0 deletions backend/Detection_Engine/GDPR_model_access.py
@@ -0,0 +1,43 @@
import os
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, pipeline


class GDPR:

    def __init__(self):
        # Inputs are truncated to the model's 512-token context window
        self.max_length_ = 512
        self.classifier = pipeline("text-classification", model="BananaFish45/gdpr_personal_data")
        self.max_length = self.max_length_

    def run_GDPR(self, text):
        return self.predict(text)

    def predict(self, input_text):
        tokens = self.classifier.tokenizer.encode(input_text, truncation=True, max_length=self.max_length_)
        truncated_text = self.classifier.tokenizer.decode(tokens, skip_special_tokens=True)

        result = self.classifier(truncated_text)
        # LABEL_0 means "no personal data"; any other label counts as one hit
        label = result[0]['label']
        return 0 if label == 'LABEL_0' else 1


if __name__ == '__main__':
    gdpr = GDPR()
    print(gdpr.run_GDPR("Samantha Schultz lives at 5365 Julie Union, Lake Jorgemouth, PA 90372. Contact them at johnsonjames@valencia-boyer.com or 5020153600.,Samantha,Schultz,S,2020-06-20,5365 Julie Union, Lake Jorgemouth, PA 90372,johnsonjames@valencia-boyer.com,5020153600,+1-762-972-3354,042-45-5223,XD20386955,445-92-8929,950-96-6247"))  # Personal data
    print(gdpr.run_GDPR("Food life marriage book if business. Long beat accept tell. Support method field agency several break citizen amount."))  # No personal data
    print(gdpr.run_GDPR("John Doe's email is john.doe@example.com and his phone number is 123-456-7890."))  # Personal data
    print(gdpr.run_GDPR("Yeshlen Moddley is an Asian business man crooked man indeed"))  # Personal data
    print(gdpr.run_GDPR("This document contains general information about the company's financials"))  # No personal data
    print(gdpr.run_GDPR("Jonas Motha is a school admin clerk"))  # No personal data
    print(gdpr.run_GDPR("Micheal Simon is an Caucasian programmer "))  # Personal data
    print(gdpr.run_GDPR("Nevin Thomas is an Hispanic scammer"))  # Personal data
    print(gdpr.run_GDPR("Samantha Williams is an African American"))  # Personal data
Binary file added backend/Detection_Engine/GND_LSG.jpg
35 changes: 35 additions & 0 deletions backend/Detection_Engine/MD_model_access.py
@@ -0,0 +1,35 @@
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


class MD:

    def __init__(self):
        tokenizer = AutoTokenizer.from_pretrained("ugaray96/biobert_ncbi_disease_ner")
        model = AutoModelForTokenClassification.from_pretrained(
            "ugaray96/biobert_ncbi_disease_ner"
        )
        self.ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

    def run_MD(self, text):
        return self.predict(text)

    def predict(self, text):
        result = self.ner_pipeline(text)

        # Collect disease mentions, merging continuation tokens into the
        # preceding entity so multi-word diseases come back as one string
        diseases = []
        for entity in result:
            if entity["entity"] == "Disease":
                diseases.append(entity["word"])
            elif entity["entity"] == "Disease Continuation" and diseases:
                diseases[-1] += f" {entity['word']}"

        return diseases


if __name__ == '__main__':
    md = MD()
    output = md.run_MD("Personal Information: Name: Harvey Spectre Date of Birth: July 18, 1972 Address: 123 Pearson Street, New York, NY Email Address: harvey.spectre@example.com Phone Number: (555) 555-1234 National Identification Number: 123-45-6789 IP Address: 192.168.1.100 Social Media Profile: @HarveySpectreLaw (Twitter) Special Categories of Personal Data: Health Data: Harvey Spectre's medical records detail his health history, including treatment for a sports-related injury sustained during his college years and regular check-ups for managing hypertension. Medication records show prescriptions for blood pressure management and occasional pain relief medication.")
    print(len(output))
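The merging loop in `MD.predict` is the interesting part: a "Disease" token starts a new span, and a "Disease Continuation" token extends the previous one. It can be exercised in isolation with a hand-built result list, no model required (`merge_disease_entities` is a hypothetical standalone name for the same loop):

```python
# Pure-Python version of the entity-merging loop from MD.predict.
def merge_disease_entities(ner_results):
    diseases = []
    for entity in ner_results:
        if entity["entity"] == "Disease":
            diseases.append(entity["word"])       # start a new disease span
        elif entity["entity"] == "Disease Continuation" and diseases:
            diseases[-1] += f" {entity['word']}"  # extend the previous span
    return diseases

# Hand-built sample mimicking the pipeline's token-level output
sample = [
    {"entity": "Disease", "word": "hyper"},
    {"entity": "Disease Continuation", "word": "tension"},
    {"entity": "O", "word": "and"},
    {"entity": "Disease", "word": "injury"},
]
print(merge_disease_entities(sample))  # ['hyper tension', 'injury']
```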
28 changes: 28 additions & 0 deletions backend/Detection_Engine/NER_model_access.py
@@ -0,0 +1,28 @@
import os

import spacy

base_dir = os.path.dirname(__file__)
model_path = os.path.join(base_dir, 'Entity_builder')

model = spacy.load("en_core_web_sm")


class NER:

    def extract_entities(self, res):
        # Return (text, label) pairs for every entity spaCy found
        entities = []
        for i in res.ents:
            entities.append((i.text, i.label_))
        return entities

    def run_NER(self, text):
        res = model(text)
        return self.extract_entities(res)


if __name__ == '__main__':
    ner_ = NER()
    x = input("Enter a sentence: ")
    print(ner_.run_NER(x))
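Each detector added in this commit returns a simple value: CA a label, EM and GDPR a 0/1 count, MD a list of diseases, NER a list of entity tuples. A hypothetical sketch of how those outputs might compose into a single noncompliance verdict; nothing in this commit defines such a function, and the rule below is an assumption for illustration only:

```python
# Hypothetical aggregator over the detector outputs (names and rule assumed):
# a personal-data hit combined with any supporting signal (ethnicity hit,
# disease mention, or named entity) is flagged as potential noncompliance.
def flag_noncompliance(gdpr_count, ethnicity_count, diseases, entities):
    special_category = ethnicity_count > 0 or len(diseases) > 0
    return gdpr_count > 0 and (special_category or len(entities) > 0)

print(flag_noncompliance(1, 0, ["hypertension"], []))            # True
print(flag_noncompliance(0, 1, [], [("Paul", "PERSON")]))        # False
```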
Empty file.
