Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(dms): add file content search to community version [Silvertouch … #366

Open
wants to merge 1 commit into
base: 16.0
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dms/__manifest__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Copyright 2024 Silvertouch technologies
# Copyright 2017-2019 MuK IT GmbH
# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).

Expand Down
69 changes: 69 additions & 0 deletions dms/models/dms_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Copyright 2024 Silvertouch technologies
# Copyright 2020 Antoni Romera
# Copyright 2017-2019 MuK IT GmbH
# Copyright 2021 Tecnativa - Víctor Martínez
Expand All @@ -10,6 +11,13 @@
from collections import defaultdict

from PIL import Image
from pypdf import PdfReader
import pytesseract
from io import BytesIO
from pdf2image import convert_from_bytes
import re
import pandas as pd
from docx import Document

from odoo import _, api, fields, models, tools
from odoo.exceptions import UserError, ValidationError
Expand Down Expand Up @@ -135,6 +143,7 @@ class File(models.Model):
)

content_file = fields.Binary(attachment=True, prefetch=False, invisible=True)
index_content = fields.Text('Indexed Content')

# Extend inherited field(s)
image_1920 = fields.Image(compute="_compute_image_1920", store=True, readonly=False)
Expand Down Expand Up @@ -561,6 +570,66 @@ def _inverse_content(self):
values = self._get_content_inital_vals()
binary = base64.b64decode(record.content or "")
values = record._update_content_vals(values, binary)

if record.mimetype == 'application/pdf':
# Extract text from PDF
try:
pdf_reader = PdfReader(BytesIO(binary))
text_content = ''
for page in pdf_reader.pages:
page_text = page.extract_text()
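# Strip NUL and other control characters (tab, LF and CR are kept) as well
# as everything in U+007F-U+00FF. NOTE(review): that last range also drops
# Latin-1 accented letters (e.g. "é", "ü"), so such words are indexed
# without them - confirm this is intended.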
page_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', page_text)
text_content += page_text
if text_content:
values['index_content'] = text_content
else:
pdf_data = base64.b64decode(record.content)
images = convert_from_bytes(pdf_data, dpi=300)
extracted_text = []
for image in images:
text = pytesseract.image_to_string(image)
extracted_text.append(text)
full_text = "\n".join(extracted_text)

values['index_content'] = full_text
except:
values['index_content'] = ''

elif record.mimetype == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
# Extract text from docx
try:
doc = Document(BytesIO(binary))
text_content = ''
for paragraph in doc.paragraphs:
text_content += paragraph.text + '\n'
values['index_content'] = text_content
except:
values['index_content'] = ''

elif record.mimetype == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
# Extract text from Excel
try:
df = pd.read_excel(BytesIO(binary))
data_paragraph = ''
for index, row in df.iterrows():
row_data = ' '.join(f'{column}: {row[column]}' for column in df.columns)
data_paragraph += f'Row {index + 1}: {row_data}\n'
data_paragraph = data_paragraph.strip()
values['index_content'] = data_paragraph
except:
values['index_content'] = ''

elif record.mimetype in ['image/png', 'image/jpeg', 'image/jpg', 'image/svg']:
# Extract text from Image
image_data = base64.b64decode(record.content)
image = Image.open(BytesIO(image_data))
text = pytesseract.image_to_string(image)
values['index_content'] = text

else:
values['index_content'] = ''

updates[tools.frozendict(values)].add(record.id)
with self.env.norecompute():
for vals, ids in updates.items():
Expand Down
3 changes: 3 additions & 0 deletions dms/views/dms_file.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--

Copyright 2024 Silvertouch technologies
Copyright 2017-2019 MuK IT GmbH
Copyright 2021 Tecnativa - Víctor Martínez
License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).
Expand All @@ -13,6 +14,7 @@
<field name="arch" type="xml">
<search>
<field name="name" filter_domain="[('name','ilike',self)]" />
<field name="index_content" filter_domain="[('index_content', 'ilike', self)]" string="Indexed Content" />
<filter
string="All Files"
name="all"
Expand Down Expand Up @@ -417,6 +419,7 @@
mimetype="mimetype"
widget="preview_binary"
/>
<field name="index_content" invisible="1"/>
</group>
<group>
<field name="extension" />
Expand Down