diff --git a/pebblo/app/api/api.py b/pebblo/app/api/api.py index 1a7f040c..4fd7295e 100644 --- a/pebblo/app/api/api.py +++ b/pebblo/app/api/api.py @@ -1,5 +1,6 @@ from fastapi import APIRouter -from pebblo.app.service.service import AppDiscover, AppLoaderDoc +from pebblo.app.service.service import AppLoaderDoc +from pebblo.app.service.discovery_service import AppDiscover class App: diff --git a/pebblo/app/enums/enums.py b/pebblo/app/enums/enums.py index c63874d1..e069fef6 100644 --- a/pebblo/app/enums/enums.py +++ b/pebblo/app/enums/enums.py @@ -19,3 +19,4 @@ class CacheDir(Enum): class ReportConstants(Enum): snippets_limit = 100 top_findings_limit = 5 + loader_history_limit = 5 diff --git a/pebblo/app/models/models.py b/pebblo/app/models/models.py index b820fc01..edc81709 100644 --- a/pebblo/app/models/models.py +++ b/pebblo/app/models/models.py @@ -1,11 +1,12 @@ from pydantic import BaseModel, Field from typing import Optional, List, Union from datetime import datetime +from uuid import UUID class Metadata(BaseModel): - createdAt: datetime = datetime.now() - modifiedAt: datetime = datetime.now() + createdAt: datetime + modifiedAt: datetime class Config: arbitrary_types_allowed = True @@ -17,7 +18,7 @@ class LoaderMetadata(BaseModel): sourceType: str sourceSize: int sourceFiles: Optional[list] = [] - lastModified: Optional[datetime] = datetime.now() + lastModified: Optional[datetime] class AiDataModel(BaseModel): @@ -39,7 +40,6 @@ class AiDocs(BaseModel): entities: dict topicCount: int topics: dict - policyViolations: Optional[List[dict]] = [] class FrameworkInfo(BaseModel): @@ -58,7 +58,7 @@ class InstanceDetails(BaseModel): platform: Optional[str] os: Optional[str] osVersion: Optional[str] - createdAt: datetime = datetime.now() + createdAt: datetime class AiApp(BaseModel): @@ -77,10 +77,10 @@ class Summary(BaseModel): findingsEntities: int findingsTopics: int totalFiles: int - filesWithRestrictedData: int + filesWithFindings: int dataSources: int owner: str - createdAt: datetime = datetime.now() + createdAt: datetime class TopFindings(BaseModel): @@ -110,12 +110,20 @@ class DataSource(BaseModel): # snippets: Optional[List[Snippets]] +class LoadHistory(BaseModel): + loadId: UUID + reportName: str + findings: int + filesWithFindings: int + generatedOn: datetime + + class ReportModel(BaseModel): name: str description: Optional[str] framework: Optional[FrameworkInfo] = Field(default_factory=FrameworkInfo) reportSummary: Optional[Summary] + loadHistory: Optional[dict] topFindings: Optional[List[TopFindings]] instanceDetails: Optional[InstanceDetails] dataSources: Optional[List[DataSource]] - lastModified: datetime diff --git a/pebblo/app/service/discovery_service.py b/pebblo/app/service/discovery_service.py new file mode 100644 index 00000000..61e4bc7a --- /dev/null +++ b/pebblo/app/service/discovery_service.py @@ -0,0 +1,134 @@ +from datetime import datetime +from pebblo.app.enums.enums import CacheDir +from pebblo.app.utils.utils import write_json_to_file, read_json_file +from pebblo.app.libs.logger import logger +from pebblo.app.models.models import Metadata, AiApp, InstanceDetails +from pydantic import ValidationError +from fastapi import HTTPException + + +class AppDiscover: + def __init__(self, data: dict): + self.data = data + self.load_id = data.get('load_id') + self.application_name = self.data.get("name") + + def _create_ai_apps_model(self, instance_details): + """ + Create an AI App Model and return the corresponding model object + """ + logger.debug("Creating AI App model") + # 
Initialize Variables + last_used = datetime.now() + metadata = Metadata( + createdAt=datetime.now(), + modifiedAt=datetime.now() + ) + ai_apps_model = AiApp( + metadata=metadata, + name=self.data.get("name"), + description=self.data.get("description", "-"), + owner=self.data.get('owner'), + pluginVersion=self.data.get("plugin_version"), + instanceDetails=instance_details, + framework=self.data.get("framework"), + lastUsed=last_used + ) + return ai_apps_model + + def _fetch_runtime_instance_details(self): + """ + Retrieve instance details from input data and return its corresponding model object. + """ + logger.debug("Retrieving instance details from input data") + # Fetching runtime instance details + runtime_dict = self.data.get("runtime", {}) + instance_details_model = InstanceDetails( + language=runtime_dict.get("language"), + languageVersion=runtime_dict.get("language_version"), + host=runtime_dict.get("host"), + ip=runtime_dict.get("ip"), + path=runtime_dict.get("path"), + runtime=runtime_dict.get("runtime"), + type=runtime_dict.get("type"), + platform=runtime_dict.get("platform"), + os=runtime_dict.get("os"), + osVersion=runtime_dict.get("os_version"), + createdAt=datetime.now() + ) + logger.debug(f"AI_APPS [{self.application_name}]: Instance Details: {instance_details_model.dict()}") + return instance_details_model + + @staticmethod + def _write_file_content_to_path(file_content, file_path): + """ + Write content to the specified file path + """ + logger.debug(f"Writing content to file path: {file_path}") + # Writing file content to given file path + write_json_to_file(file_content, file_path) + + @staticmethod + def _read_file(file_path): + """ + Retrieve the content of the specified file. + """ + logger.debug(f"Reading content from file: {file_path}") + file_content = read_json_file(file_path) + return file_content + + def _upsert_app_metadata_file(self): + """ + Update/Create app metadata file and write metadata for current run + """ + # Read metadata file & get current app metadata + app_metadata_file_path = (f"{CacheDir.home_dir.value}/" + f"{self.application_name}/{CacheDir.metadata_file_path.value}") + app_metadata = self._read_file(app_metadata_file_path) + + # write metadata file if it is not present + if not app_metadata: + # Writing app metadata to metadata file + app_metadata = {"name": self.application_name, "load_ids": [self.load_id]} + else: + if "load_ids" in app_metadata.keys(): + # Metadata file is already present, appending the current load id + app_metadata.get("load_ids").append(self.load_id) + else: + # Metadata file is present but load_ids is not; this is to support backward compatibility + app_metadata["load_ids"] = [self.load_id] + + # Writing metadata file + self._write_file_content_to_path(app_metadata, app_metadata_file_path) + + def process_request(self): + """ + Process AI App discovery Request + """ + try: + logger.debug("AI App discovery request processing started") + # Input Data + logger.debug(f"AI_APP [{self.application_name}]: Input Data: {self.data}") + + # Upsert metadata file + self._upsert_app_metadata_file() + + # getting instance details + instance_details = self._fetch_runtime_instance_details() + + # create AiApps Model + ai_apps = self._create_ai_apps_model(instance_details) + + # Write file to metadata location + file_path = (f"{CacheDir.home_dir.value}/{self.application_name}/{self.load_id}" + f"/{CacheDir.metadata_file_path.value}") + self._write_file_content_to_path(ai_apps.dict(), file_path) + + logger.debug(f"AiApp 
discovery request completed successfully") + return {"message": "App Discover Request Processed Successfully"} + except ValidationError as ex: + logger.error(f"Error in process_request. Error:{ex}") + raise HTTPException(status_code=400, detail=str(ex)) + except Exception as ex: + logger.error(f"Error in process_request. Error:{ex}") + raise HTTPException(status_code=500, detail="Internal Server Error") diff --git a/pebblo/app/service/doc_helper.py b/pebblo/app/service/doc_helper.py index 727b020c..258926e9 100644 --- a/pebblo/app/service/doc_helper.py +++ b/pebblo/app/service/doc_helper.py @@ -1,19 +1,21 @@ """ Doc helper class for loader doc related task """ - +import ast +import os.path from datetime import datetime from pebblo.app.libs.logger import logger -from pebblo.app.models.models import AiDataModel, AiDocs, ReportModel, Snippets, Summary, DataSource +from pebblo.app.models.models import AiDataModel, AiDocs, ReportModel, Snippets, Summary, DataSource, LoadHistory +from pebblo.app.utils.utils import read_json_file, get_full_path from pebblo.entity_classifier.entity_classifier import EntityClassifier from pebblo.topic_classifier.topic_classifier import TopicClassifier -from pebblo.app.enums.enums import ReportConstants +from pebblo.app.enums.enums import ReportConstants, CacheDir # Init topic classifier topic_classifier_obj = TopicClassifier() -class DocHelper: +class LoaderHelper: def __init__(self, app_details, data, load_id): self.app_details = app_details self.data = data @@ -21,6 +23,108 @@ def __init__(self, app_details, data, load_id): self.loader_mapper = {} self.entity_classifier_obj = EntityClassifier() + # Initialization + def _initialize_raw_data(self): + """ + Initialize raw data and return it as a dict object + """ + if "report_metadata" in self.app_details.keys(): + return self.app_details['report_metadata'] + + raw_data = {"total_findings": 0, "findings_entities": 0, "findings_topics": 0, + "data_source_count": 1, "data_source_snippets": list(), + "loader_source_snippets": {}, "file_count": 0, + "snippet_count": 0, "data_source_findings": {}, + "snippet_counter": 0, "total_snippet_counter": 0} + return raw_data + + @staticmethod + def _fetch_variables(raw_data): + """ + Return a list of variables + """ + # Initializing variables + return ( + raw_data.get("loader_source_snippets"), + raw_data.get("total_findings"), + raw_data.get("findings_entities"), + raw_data.get("findings_topics"), + raw_data.get("snippet_count"), + raw_data.get("file_count"), + raw_data.get("data_source_findings") + ) + + @staticmethod + def _update_raw_data(raw_data, loader_source_snippets, total_findings, findings_entities, findings_topics, + snippet_count, file_count, data_source_findings): + """ + Reassigning raw data + """ + raw_data.update({ + "loader_source_snippets": loader_source_snippets, + "total_findings": total_findings, + "findings_entities": findings_entities, + "findings_topics": findings_topics, + "snippet_count": snippet_count, + "file_count": file_count, + "data_source_findings": data_source_findings + }) + + # Model Creation + def _create_doc_model(self, doc, doc_info): + """ + Create doc model and return its object + """ + loader_details = self.data.get("loader_details", {}) + last_used = datetime.now() + doc_model = AiDocs(appId=self.load_id, + doc=doc.get('doc'), + sourceSize=doc.get('source_path_size', 0), + fileOwner=doc.get('file_owner', '-'), + sourcePath=doc.get('source_path'), + loaderSourcePath=loader_details.get("source_path"), + lastModified=last_used, + 
entityCount=doc_info.entityCount, + entities=doc_info.entities, + topicCount=doc_info.topicCount, + topics=doc_info.topics) + return doc_model.dict() + + @staticmethod + def _get_top_n_findings(raw_data): + """ + Return top N findings from all findings + """ + logger.debug("Getting top N findings details and aggregate them") + loader_source_snippets = raw_data["loader_source_snippets"] + top_n_findings_list = sorted(loader_source_snippets.items(), key=lambda x: x[1]['findings'], reverse=True)[ + :ReportConstants.top_findings_limit.value] + top_n_findings = [ + { + "fileName": key, + "fileOwner": "-" if value.get("fileOwner", "-") is None else value.get("fileOwner", "-"), + "sourceSize": 0 if value.get("sourceSize", 0) is None else value.get("sourceSize", 0), + "findingsEntities": value['findings_entities'], + "findingsTopics": value['findings_topics'], + "findings": value['findings'] + } + for key, value in top_n_findings_list + ] + return top_n_findings + + def _count_files_with_findings(self): + """ + Return the count of files that have associated findings. + """ + logger.debug("Fetching the count of files that have associated findings") + files_with_findings_count = 0 + loader_details = self.app_details.get("loaders", {}) + for loader in loader_details: + for file_dict in loader["sourceFiles"]: + if "findings" in file_dict.keys() and file_dict["findings"] > 0: + files_with_findings_count += 1 + return files_with_findings_count + def _get_classifier_response(self, doc): doc_info = AiDataModel(data=doc.get("doc", None), entities={}, entityCount=0, @@ -41,48 +145,223 @@ def _get_classifier_response(self, doc): logger.error(f"Get Classifier Response Failed, Exception: {e}") return doc_info - def _get_finding_details(self, doc, data_source_findings, entity_type, file_count, raw_data): + def _update_app_details(self, raw_data, ai_app_docs): + """ + Updating ai app details loader source files + """ + logger.debug("Updating app details") + self.app_details["docs"] = ai_app_docs + loader_source_snippets = raw_data["loader_source_snippets"] + # Updating app_details doc list and loader source files + loader_details = self.app_details.get("loaders", {}) + for loader in loader_details: + for source_file in loader.get("sourceFiles", []): + name = source_file["name"] + if name not in loader_source_snippets: + loader_source_snippets[name] = source_file + + new_source_files = [{ + "name": key, + "findings_entities": value['findings_entities'], + "findings_topics": value['findings_topics'], + "findings": value['findings'] + } + for key, value in loader_source_snippets.items() + ] + + loader["sourceFiles"] = new_source_files + self.app_details["report_metadata"] = raw_data + + @staticmethod + def _get_finding_details(doc, data_source_findings, entity_type, raw_data): + """ + Retrieve finding details from data source + """ + logger.debug(f"Fetching finding details from data source for entity type: {entity_type}") source_path = doc.get("sourcePath") snippet = Snippets(snippet=doc["doc"], sourcePath=source_path, - fileOwner=doc.get("fileOwner", " ")) + fileOwner=doc.get("fileOwner", "-")) for label_name, value in doc[entity_type].items(): if label_name in data_source_findings.keys(): data_source_findings[label_name]["snippetCount"] += 1 data_source_findings[label_name]["findings"] += value - data_source_findings[label_name]["unique_snippets"].add(source_path) raw_data["total_snippet_counter"] += 1 + + unique_snippets_set = data_source_findings[label_name]["unique_snippets"] + if 
isinstance(unique_snippets_set, str): + # When we write data_source_findings[label_name]['unique_snippets'] to metadata file, + # it gets stored as str. We need it as a set again for further processing. + # This is why we are using ast.literal_eval() here. + unique_snippets_set = ast.literal_eval(data_source_findings[label_name]['unique_snippets']) + unique_snippets_set.add(source_path) + data_source_findings[label_name]["fileCount"] = len(unique_snippets_set) + data_source_findings[label_name]["unique_snippets"] = unique_snippets_set + + # If the snippet count exceeds the snippet limit, + # we will refrain from adding the snippet to the snippet list if raw_data["snippet_counter"] < ReportConstants.snippets_limit.value: data_source_findings[label_name]["snippets"].append(snippet.dict()) raw_data["snippet_counter"] += 1 - data_source_findings[label_name]["fileCount"] = len(data_source_findings[label_name]["unique_snippets"]) else: - dict_obj = {f"labelName": label_name, "findings": value, "findingsType": entity_type, "snippetCount": 1, - "fileCount": file_count} + # The source path is encountered for the first time, so we are initializing its object. + dict_obj = { + "labelName": label_name, + "findings": value, + "findingsType": entity_type, + "snippetCount": 1, + "fileCount": 1 + } + data_source_findings[label_name] = dict_obj + data_source_findings[label_name]["unique_snippets"] = set() + data_source_findings[label_name]["unique_snippets"].add(source_path) raw_data["total_snippet_counter"] += 1 + + # If the snippet count exceeds the snippet limit, + # we will refrain from adding the snippet to the snippet list if raw_data["snippet_counter"] < ReportConstants.snippets_limit.value: data_source_findings[label_name]["snippets"] = [snippet.dict()] raw_data["snippet_counter"] += 1 else: data_source_findings[label_name]["snippets"] = [] - data_source_findings[label_name]["unique_snippets"] = set() - data_source_findings[label_name]["unique_snippets"].add(source_path) + def _get_data_source_details(self, raw_data): + """ + Create data source findings details and data source findings summary + """ + logger.debug("Aggregating data source details") + data_source_obj_list = list() + for loader in self.app_details["loaders"]: + name = loader.get("name") + source_path = loader.get("sourcePath") + source_type = loader.get("sourceType") + source_size = loader.get("sourceSize") + total_snippet_count = raw_data["total_snippet_counter"] + displayed_snippet_count = raw_data["snippet_counter"] + data_source_findings = [{key: value[key] for key in value if key != value[key] and key != "unique_snippets"} + for value in + raw_data["data_source_findings"].values()] + + # Create data source findings summary from data source findings + data_source_findings_summary = self._create_data_source_findings_summary(data_source_findings) + + data_source_obj = DataSource(name=name, + sourcePath=source_path, + sourceType=source_type, + sourceSize=source_size, + totalSnippetCount=total_snippet_count, + displayedSnippetCount=displayed_snippet_count, + findingsSummary=data_source_findings_summary, + findingsDetails=data_source_findings + ) + data_source_obj_list.append(data_source_obj) + return data_source_obj_list + + @staticmethod + def _create_data_source_findings_summary(data_source_findings): + """ + Create the data source findings summary and return it as a list + """ + logger.debug("Creating data source summary") + data_source_findings_summary = [] + for ds_findings in data_source_findings: + label_name = 
ds_findings.get("labelName", "") + findings = ds_findings.get("findings", 0) + findings_type = ds_findings.get("findingsType") + snippet_count = ds_findings.get("snippetCount", 0) + file_count = ds_findings.get("fileCount", 0) + + data_source_findings_summary.append({ + "labelName": label_name, + "findings": findings, + "findingsType": findings_type, + "snippetCount": snippet_count, + "fileCount": file_count + }) + + return data_source_findings_summary + + def _create_report_summary(self, raw_data, files_with_findings_count): + """ + Return report summary object + """ + logger.debug("Creating report summary") + report_summary = Summary( + findings=raw_data["total_findings"], + findingsEntities=raw_data["findings_entities"], + findingsTopics=raw_data["findings_topics"], + totalFiles=raw_data["file_count"], + filesWithFindings=files_with_findings_count, + dataSources=raw_data["data_source_count"], + owner=self.app_details["owner"], + createdAt=datetime.now() + ) + return report_summary + + def _get_load_history(self): + """ + Retrieve details of previous runs, create the load history, and return it + """ + logger.debug("Fetching previous execution details and creating loader history") + load_history = dict() + # Reading metadata file & get load details + app_name = self.data.get("name") + current_load_id = self.load_id + app_metadata_file_path = f"{CacheDir.home_dir.value}/{app_name}/{CacheDir.metadata_file_path.value}" + app_metadata = read_json_file(app_metadata_file_path) + if not app_metadata: + # No app metadata is present + return load_history + load_ids = app_metadata.get("load_ids", []) + + # Retrieving load id report file + # LoadHistory will be considered up to the specified load history limit. + # If the number of reports exceeds the specified limit, we provide the dir path for all reports + load_history["history"] = list() + load_history["moreReportsPath"] = "-" + report_counts = len(load_ids) + top_n_latest_loader_id = load_ids[-ReportConstants.loader_history_limit.value - 1:] + top_n_latest_loader_id.reverse() + + for load_id in top_n_latest_loader_id: + if load_id == current_load_id: + continue + load_report_file_path = f"{CacheDir.home_dir.value}/{app_name}/{load_id}/{CacheDir.report_data_file_name.value}" + report = read_json_file(load_report_file_path) + if report: + pdf_report_path = f"{CacheDir.home_dir.value}/{app_name}/{load_id}/{CacheDir.report_file_name.value}" + report_name = get_full_path(pdf_report_path) + if not os.path.exists(report_name): + # PDF file is not present, skipping it + continue + # create loader history object + report_summary = report.get("reportSummary") + load_history_model_obj = LoadHistory(loadId=load_id, + reportName=report_name, + findings=report_summary["findings"], + filesWithFindings=report_summary["filesWithFindings"], + generatedOn=report_summary["createdAt"] + ) + load_history["history"].append(load_history_model_obj.dict()) + if (len(load_history["history"]) == ReportConstants.loader_history_limit.value + and report_counts > ReportConstants.loader_history_limit.value+1): + more_reports = f"{CacheDir.home_dir.value}/{app_name}/" + more_report_full_path = get_full_path(more_reports) + load_history["moreReportsPath"] = more_report_full_path + return load_history def _get_doc_report_metadata(self, doc, raw_data): + """ + Retrieve metadata from the document, update the raw data, and then return the updated raw data. 
+ """ + logger.debug("fetching report data from input and aggregating data") # Initialize variables - loader_source_snippets = raw_data["loader_source_snippets"] - total_findings = raw_data["total_findings"] - findings_entities = raw_data["findings_entities"] - findings_topics = raw_data["findings_topics"] - snippet_count = raw_data["snippet_count"] - file_count = raw_data["file_count"] - data_source_findings = raw_data["data_source_findings"] - + (loader_source_snippets, total_findings, findings_entities, findings_topics, + snippet_count, file_count, data_source_findings) = self._fetch_variables(raw_data) # getting snippet details only if snippet has findings entities or topics. findings = doc["entityCount"] + doc["topicCount"] source_path = doc.get("sourcePath") + # If source path is already present, then add values if source_path in loader_source_snippets.keys(): loader_source_snippets[source_path]["findings_entities"] = ( @@ -109,152 +388,70 @@ def _get_doc_report_metadata(self, doc, raw_data): loader_source_snippets[source_path]["sourceSize"] = doc['sourceSize'] if len(doc["topics"]) > 0: - self._get_finding_details(doc, data_source_findings, "topics", file_count, raw_data) + self._get_finding_details(doc, data_source_findings, "topics", raw_data) if len(doc["entities"]) > 0: - self._get_finding_details(doc, data_source_findings, "entities", file_count, raw_data) + self._get_finding_details(doc, data_source_findings, "entities", raw_data) # Replace report_metadata - raw_data["loader_source_snippets"] = loader_source_snippets - raw_data["total_findings"] = total_findings - raw_data["findings_entities"] = findings_entities - raw_data["findings_topics"] = findings_topics - raw_data["snippet_count"] = snippet_count - raw_data["file_count"] = file_count - raw_data["data_source_findings"] = data_source_findings + self._update_raw_data(raw_data, loader_source_snippets, total_findings, findings_entities, findings_topics, + snippet_count, file_count, data_source_findings) return raw_data - def _get_data_source_details(self, raw_data): - data_source_obj_list = list() - for loader in self.app_details["loaders"]: - name = loader.get("name") - source_path = loader.get("sourcePath") - source_type = loader.get("sourceType") - source_size = loader.get("sourceSize") - total_snippet_count = raw_data["total_snippet_counter"] - displayed_snippet_count = raw_data["snippet_counter"] - data_source_findings = [{key: value[key] for key in value if key != value[key] and key != "unique_snippets"} for value in - raw_data["data_source_findings"].values()] - data_source_findings_summary = [] - for ds_findings in data_source_findings: - label_name = ds_findings.get("labelName", "") - findings = ds_findings.get("findings", 0) - findings_type = ds_findings.get("findingsType") - snippet_count = ds_findings.get("snippetCount", 0) - file_count = ds_findings.get("fileCount", 0) - - data_source_findings_summary.append({ - "labelName": label_name, - "findings": findings, - "findingsType": findings_type, - "snippetCount": snippet_count, - "fileCount": file_count - }) - data_source_obj = DataSource(name=name, - sourcePath=source_path, - sourceType=source_type, - sourceSize=source_size, - totalSnippetCount = total_snippet_count, - displayedSnippetCount = displayed_snippet_count, - findingsSummary=data_source_findings_summary, - findingsDetails=data_source_findings - ) - data_source_obj_list.append(data_source_obj) - return data_source_obj_list - def _generate_final_report(self, raw_data): - loader_source_snippets = 
raw_data["loader_source_snippets"] - file_count_restricted_data = 0 - for file_dict in self.app_details["loader_source_files"]: - if "findings" in file_dict.keys(): - if file_dict["findings"] > 0: - file_count_restricted_data += 1 + """ + Aggregating all input, processing the data, and generating the final report + """ + logger.debug("Generating final report") - report_summary = Summary( - findings=raw_data["total_findings"], - findingsEntities=raw_data["findings_entities"], - findingsTopics=raw_data["findings_topics"], - totalFiles=raw_data["file_count"], - filesWithRestrictedData=file_count_restricted_data, - dataSources=raw_data["data_source_count"], - owner=self.app_details["owner"] - ) + # get count of files that have associated findings. + files_with_findings_count = self._count_files_with_findings() - # Get top N findings, currently 5 - top_n_findings = sorted(loader_source_snippets.items(), key=lambda x: x[1]['findings'], reverse=True)[:ReportConstants.top_findings_limit.value] - top_n_finding_objects = [ - { - "fileName": key, - "fileOwner": "-" if value.get("fileOwner", "-") is None else value.get("fileOwner", "-"), - "sourceSize": 0 if value.get("sourceSize", 0) is None else value.get("sourceSize", 0), - "findingsEntities": value['findings_entities'], - "findingsTopics": value['findings_topics'], - "findings": value['findings'] - } - for key, value in top_n_findings - ] + # Create report summary + report_summary = self._create_report_summary(raw_data, files_with_findings_count) + + # get top N findings + top_n_findings = self._get_top_n_findings(raw_data) # Generating DataSource data_source_obj_list = self._get_data_source_details(raw_data) + + # Retrieve LoadHistory From previous executions + load_history = self._get_load_history() + report_dict = ReportModel( name=self.app_details["name"], description=self.app_details.get("description", "-"), instanceDetails=self.app_details["instanceDetails"], framework=self.app_details["framework"], reportSummary=report_summary, - topFindings=top_n_finding_objects, - lastModified=datetime.now(), + loadHistory=load_history, + topFindings=top_n_findings, dataSources=data_source_obj_list ) return report_dict.dict() def process_docs_and_generate_report(self): - loader_details = self.data.get("loader_details", {}) - # should be list of loader obj - self.loader_mapper[loader_details.get("source_path")] = {"fileOwner": self.data.get("source_owner"), - "sourceSize": loader_details.get("source_size"), - "type": loader_details.get("source_type")} + """ + Processing the doc and aggregate the report data + """ + logger.debug("Processing docs and creating report data") + # Initialize and load data input_doc_list = self.data.get('docs', []) - last_used = datetime.now() - docs = self.app_details.get("docs", []) - raw_data = {"total_findings": 0, "findings_entities": 0, "findings_topics": 0, - "data_source_count": 1, - "data_source_snippets": list(), "loader_source_snippets": {}, "file_count": 0, - "snippet_count": 0, "data_source_findings": {}, "snippet_counter": 0, - "total_snippet_counter": 0} - loader_source_files = self.app_details.get("loader_source_files", []) + ai_app_docs = self.app_details.get("docs", []) + + # Initialize raw data + raw_data = self._initialize_raw_data() + logger.debug("Iterating input doc list and perform classification and aggregating report data") for doc in input_doc_list: - # Get classifier Response if doc: + # Get classifier Response doc_info: AiDataModel = self._get_classifier_response(doc) - doc_model = 
AiDocs(appId=self.load_id, - doc=doc.get('doc'), - sourceSize=doc.get('source_path_size', 0), - fileOwner=doc.get('file_owner', '-'), - sourcePath=doc.get('source_path'), - loaderSourcePath=loader_details.get("source_path"), - lastModified=last_used, - entityCount=doc_info.entityCount, - entities=doc_info.entities, - topicCount=doc_info.topicCount, - topics=doc_info.topics) - docs.append(doc_model.dict()) - raw_data = self._get_doc_report_metadata(doc_model.dict(), raw_data) + doc_obj = self._create_doc_model(doc, doc_info) + ai_app_docs.append(doc_obj) + raw_data = self._get_doc_report_metadata(doc_obj, raw_data) - # Updating app_details doc list and loader source files - loader_source_snippets = raw_data["loader_source_snippets"] - self.app_details["docs"] = docs - - new_loader_source_files = [ - { - "name": key, - "findings_entities": value['findings_entities'], - "findings_topics": value['findings_topics'], - "findings": value['findings'] - } - for key, value in loader_source_snippets.items() - ] - loader_source_files.extend(new_loader_source_files) - self.app_details["loader_source_files"] = loader_source_files + # Updating ai apps details + self._update_app_details(raw_data, ai_app_docs) # Generate Final Report final_report = self._generate_final_report(raw_data) diff --git a/pebblo/app/service/service.py b/pebblo/app/service/service.py index 9725eb68..80373ccd 100644 --- a/pebblo/app/service/service.py +++ b/pebblo/app/service/service.py @@ -1,182 +1,140 @@ import logging from datetime import datetime +from fastapi import HTTPException +from pydantic import ValidationError + from pebblo.reports.reports import Reports from pebblo.app.enums.enums import CacheDir from pebblo.app.utils.utils import write_json_to_file, read_json_file, get_full_path -from pebblo.app.models.models import LoaderMetadata, Metadata, AiApp, InstanceDetails -from pebblo.app.service.doc_helper import DocHelper from pebblo.app.libs.logger import logger -from pydantic import ValidationError -from fastapi import HTTPException - - -class AppDiscover: - def __init__(self, data: dict): - self.data = data - self.load_id = data.get('load_id') - - def process_request(self): - """ - Process AI App discovery Request - """ - try: - application_name = self.data.get("name") - owner = self.data.get("owner") - logger.debug(f"AI_APP [{application_name}]: Input Data: {self.data}") - - # Writing app metadata file - file_context = {"name": application_name, "current_load_id": self.load_id} - file_path = f"{CacheDir.home_dir.value}/{application_name}/{CacheDir.metadata_file_path.value}" - write_json_to_file(file_context, file_path) - - # getting instance details - runtime_dict = self.data.get("runtime", {}) - instance_details_model = InstanceDetails( - language=runtime_dict.get("language"), - languageVersion=runtime_dict.get("language_version"), - host=runtime_dict.get("host"), - ip=runtime_dict.get("ip"), - path=runtime_dict.get("path"), - runtime=runtime_dict.get("runtime"), - type=runtime_dict.get("type"), - platform=runtime_dict.get("platform"), - os=runtime_dict.get("os"), - osVersion=runtime_dict.get("os_version") - ) - logger.debug(f"AI_APPS [{application_name}]: Instance Details: {instance_details_model.dict()}") - - last_used = datetime.now() - metadata = Metadata( - createdAt=datetime.now(), - modifiedAt=datetime.now() - ) - - ai_apps_model = AiApp( - metadata=metadata, - name=application_name, - description=self.data.get("description", " "), - owner=owner, - pluginVersion=self.data.get("plugin_version"), - 
instanceDetails=instance_details_model, - framework=self.data.get("framework"), - lastUsed=last_used - ) - logger.debug(f"Final Output For Discovery Call: {ai_apps_model.dict()}") - file_path = f"{CacheDir.home_dir.value}/{application_name}/{self.load_id}/{CacheDir.metadata_file_path.value}" - write_json_to_file(ai_apps_model.dict(), file_path) - logger.info("App Discover Request Processed Successfully") - return {"message": "App Discover Request Processed Successfully"} - except ValidationError as ex: - logger.error(f"Error in process_request. Error:{ex}") - raise HTTPException(status_code=400, detail=str(ex)) - except Exception as ex: - logger.error(f"Error in process_request. Error:{ex}") - raise HTTPException(status_code=500, detail="Internal Server Error") +from pebblo.app.models.models import LoaderMetadata +from pebblo.app.service.doc_helper import LoaderHelper class AppLoaderDoc: def __init__(self, data): self.data = data + self.app_name = self.data.get("name") - def process_request(self): - """This process is entrypoint function for loader doc API implementation.""" - logger.debug(f"Loader Doc, Input Data: {self.data}") + def _write_pdf_report(self, final_report): + """ + Calling pdf report generator to write report in pdf format + """ + logger.debug("Generating report in pdf format") + report_obj = Reports() + report_format = CacheDir.format.value + renderer = CacheDir.renderer.value + + # Writing pdf report to current load id directory + load_id = self.data['load_id'] + current_load_report_file_path = (f"{CacheDir.home_dir.value}/{self.app_name}" + f"/{load_id}/{CacheDir.report_file_name.value}") + full_file_path = get_full_path(current_load_report_file_path) + report_obj.generate_report(data=final_report, outputPath=full_file_path, format=report_format, + renderer=renderer) + + # Writing pdf report file specific to application name, inside app directory + current_app_report_file_path = (f"{CacheDir.home_dir.value}/{self.app_name}" + f"/{CacheDir.report_file_name.value}") + full_file_path = get_full_path(current_app_report_file_path) + report_obj.generate_report(data=final_report, outputPath=full_file_path, format=report_format, + renderer=renderer) + logger.info(f"PDF report generated, please check path : {full_file_path}") + + def _upsert_loader_details(self, app_details): + """ + Update loader details in the application if they already exist; + otherwise, add loader details to the application. 
+ """ + logger.debug("Upsert loader details to existing AI app details") + # Update loader details if they already exist in the app + loader_details = self.data.get("loader_details", {}) + loader_name = loader_details.get('loader', None) + source_type = loader_details.get('source_type', None) + source_path = loader_details.get('source_path', None) + loader_source_files = loader_details.get("source_files", []) + if loader_details.get("source_path_size") is not None: + source_size = loader_details.get("source_path_size", 0) + else: + source_size = loader_details.get("source_aggr_size", 0) + + # Checking for same loader details in app details + if loader_name and source_type: + loader_list = app_details.get('loaders', []) + loader_exist = False + for loader in loader_list: + # If loader exists, update loader SourcePath and SourceType + if loader and loader.get('name', "") == loader_name: + loader['sourcePath'] = source_path + loader['sourceType'] = source_type + loader['sourceSize'] = source_size + loader["sourceFiles"].extend(loader_source_files) + loader['lastModified'] = datetime.now() + loader_exist = True + + # If loader does not exist, create new entry + if not loader_exist: + logger.debug("loader details do not exist in app details, adding details to app details") + new_loader_data = LoaderMetadata(name=loader_name, + sourcePath=source_path, + sourceType=source_type, + sourceSize=source_size, + sourceFiles=loader_source_files, + lastModified=datetime.now()) + loader_list.append(new_loader_data.dict()) + app_details["loaders"] = loader_list + def process_request(self): + """ + This process is the entrypoint function for the loader doc API implementation. + """ try: - app_name = self.data.get("name") - logger.debug(f"AI Loader Doc, AppName: {app_name}") - - report_format = CacheDir.format.value - renderer = CacheDir.renderer.value + logger.debug("Loader doc request processing started") + logger.debug(f"Loader Doc, Application Name: {self.app_name}, Input Data: {self.data}") # Read metadata file & get current load details - app_metadata_file_path = f"{CacheDir.home_dir.value}/{app_name}/{CacheDir.metadata_file_path.value}" + app_metadata_file_path = f"{CacheDir.home_dir.value}/{self.app_name}/{CacheDir.metadata_file_path.value}" app_metadata = read_json_file(app_metadata_file_path) if not app_metadata: - return {"Message": "App details not present, Please call discovery api first"} - - prev_load_id = app_metadata.get("current_load_id") - load_id = self.data['load_id'] + return {"Message": "App details not present, Please execute discovery api first"} # Get current app details from load id - report_file_path = f"{CacheDir.home_dir.value}/{app_name}/{load_id}/{CacheDir.report_data_file_name.value}" - app_load_metadata_file_path = f"{CacheDir.home_dir.value}/{app_name}/{load_id}/{CacheDir.metadata_file_path.value}" + load_id = self.data['load_id'] + app_load_metadata_file_path = (f"{CacheDir.home_dir.value}/{self.app_name}" + f"/{load_id}/{CacheDir.metadata_file_path.value}") app_details = read_json_file(app_load_metadata_file_path) if not app_details: # TODO: Handle the case where discover call did not happen, but loader doc is being called. - logger.error("Could not read metadata file. 
Exiting.") - return - - # Get Loader Details from input - loader_details = self.data.get("loader_details", {}) - loader_name = loader_details.get('loader', None) - source_type = loader_details.get('source_type', None) - source_path = loader_details.get('source_path', None) - if loader_details.get("source_path_size") is not None: - source_size = loader_details.get("source_path_size", 0) - else: - source_size = loader_details.get("source_aggr_size", 0) - - # Checking for same loader details in app details - if loader_name and source_type: - loader_list = app_details.get('loaders', []) - loader_exist = False - for loader in loader_list: - # If loader exist, update loader SourcePath and SourceType - if loader and loader.get('name', "") == loader_name: - loader['sourcePath'] = source_path - loader['sourceType'] = source_type - loader['sourceSize'] = source_size - loader['lastModified'] = datetime.now() - loader_exist = True - - # If loader does not exist, create new entry - if not loader_exist: - logger.debug("loader not exist in app details") - new_loader_data = LoaderMetadata(name=loader_name, - sourcePath=source_path, - sourceType=source_type, - sourceSize=source_size, - lastModified=datetime.now()) - loader_list.append(new_loader_data.dict()) - app_details["loaders"] = loader_list - - # Fetching doc details from input & app details & generate final report - doc_helper_obj = DocHelper(app_details, self.data, load_id) - app_details, final_report = doc_helper_obj.process_docs_and_generate_report() + logger.error(f"Could not read metadata file at {app_load_metadata_file_path}. Exiting.") + return {"Message": f"Could not read metadata file at {app_load_metadata_file_path}. Exiting"} + + # Add/Update Loader Details with input loader details + self._upsert_loader_details(app_details) + + # process input docs, app details, and generate final report + loader_helper_obj = LoaderHelper(app_details, self.data, load_id) + app_details, final_report = loader_helper_obj.process_docs_and_generate_report() + logger.debug(f"Final Report with doc details: {final_report}") - # Write current state to the file. 
- write_json_to_file(app_details, app_load_metadata_file_path) # app_details - # This write will overwrite app_discovery Report + # Write current state to the file, updating app details + write_json_to_file(app_details, app_load_metadata_file_path) + + # check whether report generation is necessary loading_end = self.data['loading_end'] if loading_end: logger.debug("Loading finished, generating report") - # writing json report as well for now - write_json_to_file(final_report, report_file_path) - - logger.debug("Generating report in pdf format") - report_obj = Reports() - - # Writing pdf report to current load id directory - load_id = self.data['load_id'] - current_load_report_file_path = (f"{CacheDir.home_dir.value}/{app_name}" - f"/{load_id}/{CacheDir.report_file_name.value}") - full_file_path = get_full_path(current_load_report_file_path) - report_obj.generate_report(data=final_report, outputPath=full_file_path, format=report_format, - renderer=renderer) - - # Writing pdf report file specific to application name, inside app directory - current_app_report_file_path = (f"{CacheDir.home_dir.value}/{app_name}" - f"/{CacheDir.report_file_name.value}") - full_file_path = get_full_path(current_app_report_file_path) - - report_obj.generate_report(data=final_report, outputPath=full_file_path, format=report_format, - renderer=renderer) - logger.info(f"PDF report generated at : {full_file_path}") - - logger.info("Loader Doc request Request processed successfully.") + + # writing report file to its load_id directory + json_report_file_path = (f"{CacheDir.home_dir.value}/{self.app_name}" + f"/{load_id}/{CacheDir.report_data_file_name.value}") + write_json_to_file(final_report, json_report_file_path) + + # Writing report in pdf format + self._write_pdf_report(final_report) + + logger.debug("Loader Doc request processed successfully.") return {"message": "Loader Doc API Request processed successfully"} except ValidationError as ex: logger.error(f"AI_LOADER_DOC Failed. Error:{ex}") diff --git a/pebblo/app/utils/utils.py b/pebblo/app/utils/utils.py index b6b27876..ae50180a 100644 --- a/pebblo/app/utils/utils.py +++ b/pebblo/app/utils/utils.py @@ -13,7 +13,12 @@ def default(self, obj): def write_json_to_file(data, file_path): + """ + Write content to the specified file path + """ try: + # Writing file content to given file path + logger.debug(f"Writing content to file path: {file_path}") full_file_path = get_full_path(file_path) # Create parent directories if needed dir_path = path.dirname(full_file_path) @@ -26,6 +31,10 @@ def read_json_file(file_path): + """ + Retrieve the content of the specified file. 
+ """ + logger.debug(f"Reading content from file: {file_path}") full_file_path = "" try: full_file_path = get_full_path(file_path) diff --git a/pebblo/reports/html_to_pdf_generator/report_generator.py b/pebblo/reports/html_to_pdf_generator/report_generator.py index 9c692dc7..aaf9139c 100644 --- a/pebblo/reports/html_to_pdf_generator/report_generator.py +++ b/pebblo/reports/html_to_pdf_generator/report_generator.py @@ -25,6 +25,6 @@ def convertHtmlToPdf(data, outputPath, templateName, searchPath, renderer): templateLoader = jinja2.FileSystemLoader(searchpath=searchPath) templateEnv = jinja2.Environment(loader=templateLoader) template = templateEnv.get_template(templateName) - sourceHtml = template.render(data=data, date=datetime.datetime.now(), datastores=data["dataSources"][0], findingDetails=data["dataSources"][0]["findingsDetails"], dateFormatter=dateFormatter, getFileSize=getFileSize) + sourceHtml = template.render(data=data, date=datetime.datetime.now(), datastores=data["dataSources"][0], findingDetails=data["dataSources"][0]["findingsDetails"], loadHistoryItemsToDisplay=data["loadHistory"]["history"][:5] , dateFormatter=dateFormatter, getFileSize=getFileSize) pdfConverter = library_function_mapping[renderer] pdfConverter(sourceHtml, outputPath, searchPath) diff --git a/pebblo/reports/templates/weasyprintTemplate.html b/pebblo/reports/templates/weasyprintTemplate.html index 140af428..58d05cc8 100644 --- a/pebblo/reports/templates/weasyprintTemplate.html +++ b/pebblo/reports/templates/weasyprintTemplate.html @@ -61,7 +61,7 @@
- | -- | -- | -- | +{{ item.reportName }} | +{{ item.findings }} | +{{ item.filesWithFindings }} | +{{ dateFormatter(item.generatedOn) }} |
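Reviewer note (not part of the diff): below is a minimal, self-contained sketch of the history-selection logic that _get_load_history introduces, using a plain list of load ids instead of the cached report files on disk. The name select_history and the standalone LOADER_HISTORY_LIMIT constant are illustrative only; the real method also skips load ids whose JSON report or PDF file is missing, and it relies on the current load id being the most recent entry written by _upsert_app_metadata_file.

from typing import List, Tuple

LOADER_HISTORY_LIMIT = 5  # mirrors ReportConstants.loader_history_limit


def select_history(load_ids: List[str], current_load_id: str) -> Tuple[List[str], bool]:
    # Take one extra id so that skipping the current run still leaves up to the limit.
    latest = load_ids[-LOADER_HISTORY_LIMIT - 1:]
    latest.reverse()  # newest first
    history = [load_id for load_id in latest if load_id != current_load_id]
    # "moreReportsPath" is only populated when the history is full and older reports exist.
    more_reports = (len(history) == LOADER_HISTORY_LIMIT
                    and len(load_ids) > LOADER_HISTORY_LIMIT + 1)
    return history, more_reports


if __name__ == "__main__":
    ids = [f"load-{i}" for i in range(8)]
    print(select_history(ids, current_load_id="load-7"))
    # (['load-6', 'load-5', 'load-4', 'load-3', 'load-2'], True)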