Skip to content

Commit

Permalink
Merge pull request #1 from amazingvince/main
Browse files Browse the repository at this point in the history
adding support for 3.7 and colab
  • Loading branch information
ankrgyl authored Aug 21, 2022
2 parents 9f804c0 + f1aea28 commit 13676dc
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 5 deletions.
57 changes: 57 additions & 0 deletions DocQA_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "DocQA_example.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyPo0xYtL+cYJfNl6lhkeDjX",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/amazingvince/docqa/blob/main/DocQA_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yS9UNjHnAAS9"
},
"outputs": [],
"source": [
"!git clone https://github.com/impira/docqa.git\n",
"!sudo apt install tesseract-ocr\n",
"!sudo apt-get install poppler-utils\n",
"!cd docqa && pip install .[all] "
]
},
{
"cell_type": "code",
"source": [
"!docqa scan \"who authored this paper?\" https://arxiv.org/pdf/2101.07597.pdf"
],
"metadata": {
"id": "bKRRY5u2DV52"
},
"execution_count": null,
"outputs": []
}
]
}
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@

install_requires = [
"torch >= 1.0",
"git+https://github.com/impira/transformers@153d1361c7dcc91c7735cae73e1f594cfcab3e21",
"transformers @ git+https://github.com/impira/transformers@153d1361c7dcc91c7735cae73e1f594cfcab3e21",
"pdf2image",
"pdfplumber",
"Pillow",
"pillow-simd",
"pydantic",
"pytesseract", # TODO: Test what happens if the host machine does not have tesseract installed
"requests",
Expand Down Expand Up @@ -59,7 +59,7 @@
],
package_dir={"": "src"},
packages=setuptools.find_packages(where="src"),
python_requires=">=3.8.0",
python_requires=">=3.7.0",
entry_points={
"console_scripts": ["docqa = docqa.cmd.__main__:main"],
},
Expand Down
13 changes: 11 additions & 2 deletions src/docqa/document.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import abc
import logging
import os
from functools import cached_property

from io import BytesIO
from typing import List, Tuple

Expand All @@ -10,6 +11,13 @@
from .ext import transformers


try:
from functools import cached_property as cached_property
except ImportError:
# Python 3.7 has no functools.cached_property; fall back to a plain property (the value is recomputed on each access instead of being cached).
cached_property = property


class UnsupportedDocument(Exception):
def __init__(self, e):
self.e = e
Expand Down Expand Up @@ -83,11 +91,12 @@ def apply_tesseract(*args, **kwargs):
return transformers.apply_tesseract(*args, **kwargs)


class Document:
class Document(metaclass=abc.ABCMeta):
def __init__(self, b):
self.b = b

@property
@abc.abstractmethod
def context(self) -> Tuple[(str, List[int])]:
raise NotImplementedError

Expand Down

0 comments on commit 13676dc

Please sign in to comment.