diff --git a/airbyte-integrations/connectors/source-microsoft-onedrive/integration_tests/abnormal_state.json b/airbyte-integrations/connectors/source-microsoft-onedrive/integration_tests/abnormal_state.json index 1c86c5e27bee..def1060e067d 100644 --- a/airbyte-integrations/connectors/source-microsoft-onedrive/integration_tests/abnormal_state.json +++ b/airbyte-integrations/connectors/source-microsoft-onedrive/integration_tests/abnormal_state.json @@ -6,10 +6,13 @@ "name": "test_csv" }, "stream_state": { - "_ab_source_file_last_modified": "2023-12-23T06:49:25.000000Z_Test_folder_2/TestFileOneDrive.csv", "history": { - "Test_folder_2/TestFileOneDrive.csv": "2023-12-23T06:49:25.000000Z" - } + "https://airbyte179.sharepoint.com/Shared%20Documents/Test_folder/TestFile.csv": "2023-11-17T13:52:35.000000Z", + "Test_folder_2/TestFileOneDrive.csv": "2023-11-23T06:49:25.000000Z", + "https://airbyte179.sharepoint.com/Shared%20Documents/Test_folder/Test_folder_2/TestFileSharePoint.csv": "2023-12-15T17:34:08.000000Z", + "https://airbyte179.sharepoint.com/Shared%20Documents/Test_folder/simple_test.csv": "2024-01-16T12:45:20.000000Z" + }, + "_ab_source_file_last_modified": "2024-01-16T12:45:20.000000Z_https://airbyte179.sharepoint.com/Shared%20Documents/Test_folder/simple_test.csv" } } }, @@ -20,10 +23,11 @@ "name": "test_unstructured" }, "stream_state": { - "_ab_source_file_last_modified": "2023-12-23T06:49:25.000000Z_simple_pdf_file.pdf", "history": { - "simple_pdf_file.pdf": "2023-12-23T06:49:25.000000Z" - } + "simple_pdf_file.pdf": "2023-12-13T11:24:38.000000Z", + "https://airbyte179.sharepoint.com/Shared%20Documents/Test_folder/Test_foler_2_1/simple_pdf_file.pdf": "2023-12-15T16:47:21.000000Z" + }, + "_ab_source_file_last_modified": "2023-12-15T16:47:21.000000Z_https://airbyte179.sharepoint.com/Shared%20Documents/Test_folder/Test_foler_2_1/simple_pdf_file.pdf" } } } diff --git a/airbyte-integrations/connectors/source-microsoft-onedrive/integration_tests/spec.json b/airbyte-integrations/connectors/source-microsoft-onedrive/integration_tests/spec.json index 24ee82201f5a..e94d919d9493 100644 --- a/airbyte-integrations/connectors/source-microsoft-onedrive/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-microsoft-onedrive/integration_tests/spec.json @@ -52,7 +52,8 @@ }, "primary_key": { "title": "Primary Key", - "description": "The column or columns (for a composite key) that serves as the unique identifier of a record.", + "description": "The column or columns (for a composite key) that serves as the unique identifier of a record. If empty, the primary key will default to the parser's default primary key.", + "airbyte_hidden": true, "type": "string" }, "days_to_sync_if_history_is_full": { @@ -274,12 +275,46 @@ "const": "unstructured", "type": "string" }, - "skip_unprocessable_file_types": { - "title": "Skip Unprocessable File Types", - "description": "If true, skip files that cannot be parsed because of their file type and log a warning. If false, fail the sync. Corrupted files with valid file types will still result in a failed sync.", + "skip_unprocessable_files": { + "title": "Skip Unprocessable Files", + "description": "If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.", "default": true, "always_show": true, "type": "boolean" + }, + "strategy": { + "title": "Parsing Strategy", + "description": "The strategy used to parse documents. 
`fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf", + "default": "auto", + "always_show": true, + "order": 0, + "enum": ["auto", "fast", "ocr_only", "hi_res"], + "type": "string" + }, + "processing": { + "title": "Processing", + "description": "Processing configuration", + "default": { + "mode": "local" + }, + "type": "object", + "oneOf": [ + { + "title": "Local", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "local", + "const": "local", + "enum": ["local"], + "type": "string" + } + }, + "description": "Process files locally, supporting `fast` and `ocr` modes. This is the default option.", + "required": ["mode"] + } + ] } }, "description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file.", @@ -400,20 +435,39 @@ "order": 2, "type": "string" }, + "search_scope": { + "title": "Search Scope", + "description": "Specifies the location(s) to search for files. Valid options are 'ACCESSIBLE_DRIVES' to search in the selected OneDrive drive, 'SHARED_ITEMS' for shared items the user has access to, and 'ALL' to search both.", + "default": "ALL", + "enum": ["ACCESSIBLE_DRIVES", "SHARED_ITEMS", "ALL"], + "order": 3, + "type": "string" + }, "folder_path": { "title": "Folder Path", - "description": "Path to folder of the Microsoft OneDrive drive where the file(s) exist.", - "order": 3, + "description": "Path to a specific folder within the drives to search for files. Leave empty to search all folders of the drives. 
This does not apply to shared items.", + "default": ".", + "order": 4, "type": "string" } }, - "required": ["streams", "credentials", "folder_path"] + "required": ["streams", "credentials"] }, "advanced_auth": { "auth_flow_type": "oauth2.0", "predicate_key": ["credentials", "auth_type"], "predicate_value": "Client", "oauth_config_specification": { + "oauth_user_input_from_connector_config_specification": { + "type": "object", + "additionalProperties": false, + "properties": { + "tenant_id": { + "type": "string", + "path_in_connector_config": ["credentials", "tenant_id"] + } + } + }, "complete_oauth_output_specification": { "type": "object", "additionalProperties": false, diff --git a/airbyte-integrations/connectors/source-microsoft-onedrive/metadata.yaml b/airbyte-integrations/connectors/source-microsoft-onedrive/metadata.yaml index ec2d958e7e74..c84600ea455e 100644 --- a/airbyte-integrations/connectors/source-microsoft-onedrive/metadata.yaml +++ b/airbyte-integrations/connectors/source-microsoft-onedrive/metadata.yaml @@ -20,7 +20,7 @@ data: connectorSubtype: file connectorType: source definitionId: 01d1c685-fd4a-4837-8f4c-93fe5a0d2188 - dockerImageTag: 0.1.9 + dockerImageTag: 0.2.0 dockerRepository: airbyte/source-microsoft-onedrive githubIssueLabel: source-microsoft-onedrive icon: microsoft-onedrive.svg diff --git a/airbyte-integrations/connectors/source-microsoft-onedrive/poetry.lock b/airbyte-integrations/connectors/source-microsoft-onedrive/poetry.lock index 9c0383c5574a..8709d07711bb 100644 --- a/airbyte-integrations/connectors/source-microsoft-onedrive/poetry.lock +++ b/airbyte-integrations/connectors/source-microsoft-onedrive/poetry.lock @@ -1,14 +1,14 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "airbyte-cdk" -version = "0.61.0" +version = "0.71.0" description = "A framework for writing Airbyte Connectors." 
optional = false python-versions = ">=3.8" files = [ - {file = "airbyte-cdk-0.61.0.tar.gz", hash = "sha256:8beda008c5a177041ac02860a431ce7b1ecd00062a4a8f31fe6ac446cbed3e70"}, - {file = "airbyte_cdk-0.61.0-py3-none-any.whl", hash = "sha256:3f989bfe692c9519d61f9120ddb744ab82c432c2caf25374d4d6f5cdc374a1e9"}, + {file = "airbyte-cdk-0.71.0.tar.gz", hash = "sha256:110959840681b770e9378f9bcbca7a4b50c75b11de74e9fb809112407c4f50fa"}, + {file = "airbyte_cdk-0.71.0-py3-none-any.whl", hash = "sha256:730365365e826311d88dc0a8a5ebbd6227cc41b3dc342ef1525061b6d93f889c"}, ] [package.dependencies] @@ -28,7 +28,7 @@ markdown = {version = "*", optional = true, markers = "extra == \"file-based\""} pdf2image = {version = "1.16.3", optional = true, markers = "extra == \"file-based\""} "pdfminer.six" = {version = "20221105", optional = true, markers = "extra == \"file-based\""} pendulum = "<3.0.0" -pyarrow = {version = "12.0.1", optional = true, markers = "extra == \"file-based\""} +pyarrow = {version = ">=15.0.0,<15.1.0", optional = true, markers = "extra == \"file-based\""} pydantic = ">=1.10.8,<2.0.0" pyrate-limiter = ">=3.1.0,<3.2.0" pytesseract = {version = "0.3.10", optional = true, markers = "extra == \"file-based\""} @@ -44,8 +44,8 @@ unstructured = [ wcmatch = "8.4" [package.extras] -dev = ["avro (>=1.11.2,<1.12.0)", "cohere (==4.21)", "fastavro (>=1.8.0,<1.9.0)", "freezegun", "langchain (==0.0.271)", "markdown", "mypy", "openai[embeddings] (==0.27.9)", "pandas (==2.0.3)", "pdf2image (==1.16.3)", "pdfminer.six (==20221105)", "pyarrow (==12.0.1)", "pytesseract (==0.3.10)", "pytest", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests-mock", "tiktoken (==0.4.0)", "unstructured (==0.10.27)", "unstructured.pytesseract (>=0.3.12)", "unstructured[docx,pptx] (==0.10.27)"] -file-based = ["avro (>=1.11.2,<1.12.0)", "fastavro (>=1.8.0,<1.9.0)", "markdown", "pdf2image (==1.16.3)", "pdfminer.six (==20221105)", "pyarrow (==12.0.1)", "pytesseract (==0.3.10)", "unstructured (==0.10.27)", "unstructured.pytesseract (>=0.3.12)", "unstructured[docx,pptx] (==0.10.27)"] +dev = ["avro (>=1.11.2,<1.12.0)", "cohere (==4.21)", "fastavro (>=1.8.0,<1.9.0)", "freezegun", "langchain (==0.0.271)", "markdown", "mypy", "openai[embeddings] (==0.27.9)", "pandas (==2.0.3)", "pdf2image (==1.16.3)", "pdfminer.six (==20221105)", "pyarrow (>=15.0.0,<15.1.0)", "pytesseract (==0.3.10)", "pytest", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests-mock", "tiktoken (==0.4.0)", "unstructured (==0.10.27)", "unstructured.pytesseract (>=0.3.12)", "unstructured[docx,pptx] (==0.10.27)"] +file-based = ["avro (>=1.11.2,<1.12.0)", "fastavro (>=1.8.0,<1.9.0)", "markdown", "pdf2image (==1.16.3)", "pdfminer.six (==20221105)", "pyarrow (>=15.0.0,<15.1.0)", "pytesseract (==0.3.10)", "unstructured (==0.10.27)", "unstructured.pytesseract (>=0.3.12)", "unstructured[docx,pptx] (==0.10.27)"] sphinx-docs = ["Sphinx (>=4.2,<5.0)", "sphinx-rtd-theme (>=1.0,<2.0)"] vector-db-based = ["cohere (==4.21)", "langchain (==0.0.271)", "openai[embeddings] (==0.27.9)", "tiktoken (==0.4.0)"] @@ -595,13 +595,13 @@ files = [ [[package]] name = "fsspec" -version = "2024.2.0" +version = "2024.3.0" description = "File-system specification" optional = false python-versions = ">=3.8" files = [ - {file = "fsspec-2024.2.0-py3-none-any.whl", hash = "sha256:817f969556fa5916bc682e02ca2045f96ff7f586d45110fcb76022063ad2c7d8"}, - {file = "fsspec-2024.2.0.tar.gz", hash = "sha256:b6ad1a679f760dda52b1168c859d01b7b80648ea6f7f7c7f5a8a91dc3f3ecb84"}, + {file = 
"fsspec-2024.3.0-py3-none-any.whl", hash = "sha256:779001bd0122c9c4975cf03827d5e86c3afb914a3ae27040f15d341ab506a693"}, + {file = "fsspec-2024.3.0.tar.gz", hash = "sha256:f13a130c0ed07e15c4e1aeb0472a823e9c426b0b5792a1f40d902b0a71972d43"}, ] [package.extras] @@ -895,13 +895,13 @@ source = ["Cython (>=3.0.7)"] [[package]] name = "markdown" -version = "3.5.2" +version = "3.6" description = "Python implementation of John Gruber's Markdown." optional = false python-versions = ">=3.8" files = [ - {file = "Markdown-3.5.2-py3-none-any.whl", hash = "sha256:d43323865d89fc0cb9b20c75fc8ad313af307cc087e84b657d9eec768eddeadd"}, - {file = "Markdown-3.5.2.tar.gz", hash = "sha256:e1ac7b3dc550ee80e602e71c1d168002f062e49f1b11e26a36264dafd4df2ef8"}, + {file = "Markdown-3.6-py3-none-any.whl", hash = "sha256:48f276f4d8cfb8ce6527c8f79e2ee29708508bf4d40aa410fbc3b4ee832c850f"}, + {file = "Markdown-3.6.tar.gz", hash = "sha256:ed4f41f6daecbeeb96e576ce414c41d2d876daa9a16cb35fa8ed8c2ddfad0224"}, ] [package.dependencies] @@ -1326,40 +1326,51 @@ files = [ [[package]] name = "pyarrow" -version = "12.0.1" +version = "15.0.1" description = "Python library for Apache Arrow" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pyarrow-12.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:6d288029a94a9bb5407ceebdd7110ba398a00412c5b0155ee9813a40d246c5df"}, - {file = "pyarrow-12.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:345e1828efdbd9aa4d4de7d5676778aba384a2c3add896d995b23d368e60e5af"}, - {file = "pyarrow-12.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d6009fdf8986332b2169314da482baed47ac053311c8934ac6651e614deacd6"}, - {file = "pyarrow-12.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d3c4cbbf81e6dd23fe921bc91dc4619ea3b79bc58ef10bce0f49bdafb103daf"}, - {file = "pyarrow-12.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:cdacf515ec276709ac8042c7d9bd5be83b4f5f39c6c037a17a60d7ebfd92c890"}, - {file = "pyarrow-12.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:749be7fd2ff260683f9cc739cb862fb11be376de965a2a8ccbf2693b098db6c7"}, - {file = "pyarrow-12.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6895b5fb74289d055c43db3af0de6e16b07586c45763cb5e558d38b86a91e3a7"}, - {file = "pyarrow-12.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1887bdae17ec3b4c046fcf19951e71b6a619f39fa674f9881216173566c8f718"}, - {file = "pyarrow-12.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2c9cb8eeabbadf5fcfc3d1ddea616c7ce893db2ce4dcef0ac13b099ad7ca082"}, - {file = "pyarrow-12.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:ce4aebdf412bd0eeb800d8e47db854f9f9f7e2f5a0220440acf219ddfddd4f63"}, - {file = "pyarrow-12.0.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:e0d8730c7f6e893f6db5d5b86eda42c0a130842d101992b581e2138e4d5663d3"}, - {file = "pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43364daec02f69fec89d2315f7fbfbeec956e0d991cbbef471681bd77875c40f"}, - {file = "pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:051f9f5ccf585f12d7de836e50965b3c235542cc896959320d9776ab93f3b33d"}, - {file = "pyarrow-12.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:be2757e9275875d2a9c6e6052ac7957fbbfc7bc7370e4a036a9b893e96fedaba"}, - {file = "pyarrow-12.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:cf812306d66f40f69e684300f7af5111c11f6e0d89d6b733e05a3de44961529d"}, - {file 
= "pyarrow-12.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:459a1c0ed2d68671188b2118c63bac91eaef6fc150c77ddd8a583e3c795737bf"}, - {file = "pyarrow-12.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85e705e33eaf666bbe508a16fd5ba27ca061e177916b7a317ba5a51bee43384c"}, - {file = "pyarrow-12.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9120c3eb2b1f6f516a3b7a9714ed860882d9ef98c4b17edcdc91d95b7528db60"}, - {file = "pyarrow-12.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:c780f4dc40460015d80fcd6a6140de80b615349ed68ef9adb653fe351778c9b3"}, - {file = "pyarrow-12.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a3c63124fc26bf5f95f508f5d04e1ece8cc23a8b0af2a1e6ab2b1ec3fdc91b24"}, - {file = "pyarrow-12.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b13329f79fa4472324f8d32dc1b1216616d09bd1e77cfb13104dec5463632c36"}, - {file = "pyarrow-12.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb656150d3d12ec1396f6dde542db1675a95c0cc8366d507347b0beed96e87ca"}, - {file = "pyarrow-12.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6251e38470da97a5b2e00de5c6a049149f7b2bd62f12fa5dbb9ac674119ba71a"}, - {file = "pyarrow-12.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3de26da901216149ce086920547dfff5cd22818c9eab67ebc41e863a5883bac7"}, - {file = "pyarrow-12.0.1.tar.gz", hash = "sha256:cce317fc96e5b71107bf1f9f184d5e54e2bd14bbf3f9a3d62819961f0af86fec"}, + {file = "pyarrow-15.0.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:c2ddb3be5ea938c329a84171694fc230b241ce1b6b0ff1a0280509af51c375fa"}, + {file = "pyarrow-15.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7543ea88a0ff72f8e6baaf9bfdbec2c62aeabdbede9e4a571c71cc3bc43b6302"}, + {file = "pyarrow-15.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1519e218a6941fc074e4501088d891afcb2adf77c236e03c34babcf3d6a0d1c7"}, + {file = "pyarrow-15.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28cafa86e1944761970d3b3fc0411b14ff9b5c2b73cd22aaf470d7a3976335f5"}, + {file = "pyarrow-15.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:be5c3d463e33d03eab496e1af7916b1d44001c08f0f458ad27dc16093a020638"}, + {file = "pyarrow-15.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:47b1eda15d3aa3f49a07b1808648e1397e5dc6a80a30bf87faa8e2d02dad7ac3"}, + {file = "pyarrow-15.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:e524a31be7db22deebbbcf242b189063ab9a7652c62471d296b31bc6e3cae77b"}, + {file = "pyarrow-15.0.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:a476fefe8bdd56122fb0d4881b785413e025858803cc1302d0d788d3522b374d"}, + {file = "pyarrow-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:309e6191be385f2e220586bfdb643f9bb21d7e1bc6dd0a6963dc538e347b2431"}, + {file = "pyarrow-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83bc586903dbeb4365cbc72b602f99f70b96c5882e5dfac5278813c7d624ca3c"}, + {file = "pyarrow-15.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07e652daac6d8b05280cd2af31c0fb61a4490ec6a53dc01588014d9fa3fdbee9"}, + {file = "pyarrow-15.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:abad2e08652df153a72177ce20c897d083b0c4ebeec051239e2654ddf4d3c996"}, + {file = "pyarrow-15.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cde663352bc83ad75ba7b3206e049ca1a69809223942362a8649e37bd22f9e3b"}, + {file = "pyarrow-15.0.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:1b6e237dd7a08482a8b8f3f6512d258d2460f182931832a8c6ef3953203d31e1"}, + {file = "pyarrow-15.0.1-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:7bd167536ee23192760b8c731d39b7cfd37914c27fd4582335ffd08450ff799d"}, + {file = "pyarrow-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7c08bb31eb2984ba5c3747d375bb522e7e536b8b25b149c9cb5e1c49b0ccb736"}, + {file = "pyarrow-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0f9c1d630ed2524bd1ddf28ec92780a7b599fd54704cd653519f7ff5aec177a"}, + {file = "pyarrow-15.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5186048493395220550bca7b524420471aac2d77af831f584ce132680f55c3df"}, + {file = "pyarrow-15.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:31dc30c7ec8958da3a3d9f31d6c3630429b2091ede0ecd0d989fd6bec129f0e4"}, + {file = "pyarrow-15.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:3f111a014fb8ac2297b43a74bf4495cc479a332908f7ee49cb7cbd50714cb0c1"}, + {file = "pyarrow-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:a6d1f7c15d7f68f08490d0cb34611497c74285b8a6bbeab4ef3fc20117310983"}, + {file = "pyarrow-15.0.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:9ad931b996f51c2f978ed517b55cb3c6078272fb4ec579e3da5a8c14873b698d"}, + {file = "pyarrow-15.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:738f6b53ab1c2f66b2bde8a1d77e186aeaab702d849e0dfa1158c9e2c030add3"}, + {file = "pyarrow-15.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c1c3fc16bc74e33bf8f1e5a212938ed8d88e902f372c4dac6b5bad328567d2f"}, + {file = "pyarrow-15.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1fa92512128f6c1b8dde0468c1454dd70f3bff623970e370d52efd4d24fd0be"}, + {file = "pyarrow-15.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:b4157f307c202cbbdac147d9b07447a281fa8e63494f7fc85081da351ec6ace9"}, + {file = "pyarrow-15.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:b75e7da26f383787f80ad76143b44844ffa28648fcc7099a83df1538c078d2f2"}, + {file = "pyarrow-15.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:3a99eac76ae14096c209850935057b9e8ce97a78397c5cde8724674774f34e5d"}, + {file = "pyarrow-15.0.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:dd532d3177e031e9b2d2df19fd003d0cc0520d1747659fcabbd4d9bb87de508c"}, + {file = "pyarrow-15.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ce8c89848fd37e5313fc2ce601483038ee5566db96ba0808d5883b2e2e55dc53"}, + {file = "pyarrow-15.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:862eac5e5f3b6477f7a92b2f27e560e1f4e5e9edfca9ea9da8a7478bb4abd5ce"}, + {file = "pyarrow-15.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f0ea3a29cd5cb99bf14c1c4533eceaa00ea8fb580950fb5a89a5c771a994a4e"}, + {file = "pyarrow-15.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb902f780cfd624b2e8fd8501fadab17618fdb548532620ef3d91312aaf0888a"}, + {file = "pyarrow-15.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4f87757f02735a6bb4ad2e1b98279ac45d53b748d5baf52401516413007c6999"}, + {file = "pyarrow-15.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:efd3816c7fbfcbd406ac0f69873cebb052effd7cdc153ae5836d1b00845845d7"}, + {file = "pyarrow-15.0.1.tar.gz", hash = "sha256:21d812548d39d490e0c6928a7c663f37b96bf764034123d4b4ab4530ecc757a9"}, ] [package.dependencies] -numpy = ">=1.16.6" +numpy = ">=1.16.6,<2" [[package]] name = "pycparser" @@ -1672,7 +1683,6 @@ files = [ {file = 
"PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2115,18 +2125,18 @@ torch = ["safetensors[numpy]", "torch (>=1.10)"] [[package]] name = "setuptools" -version = "69.1.1" +version = "69.2.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"}, - {file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"}, + {file = "setuptools-69.2.0-py3-none-any.whl", hash = "sha256:c21c49fb1042386df081cb5d86759792ab89efca84cf114889191cd09aacc80c"}, + {file = "setuptools-69.2.0.tar.gz", hash = "sha256:0ff4183f8f42cd8fa3acea16c45205521a4ef28f73c6391d8a25e92893134f2e"}, ] [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] @@ -2670,20 +2680,20 @@ files = [ [[package]] name = "zipp" -version = "3.17.0" +version = "3.18.1" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false 
python-versions = ">=3.8" files = [ - {file = "zipp-3.17.0-py3-none-any.whl", hash = "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31"}, - {file = "zipp-3.17.0.tar.gz", hash = "sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0"}, + {file = "zipp-3.18.1-py3-none-any.whl", hash = "sha256:206f5a15f2af3dbaee80769fb7dc6f249695e940acca08dfb2a4769fe61e538b"}, + {file = "zipp-3.18.1.tar.gz", hash = "sha256:2884ed22e7d8961de1c9a05142eb69a247f120291bc0206a00a7642f09b5b715"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [metadata] lock-version = "2.0" python-versions = "^3.9,<3.12" -content-hash = "47d4d7c22bd95e2ee9c3543f485f76e05899c49e02fdbe1abcfff2cb8b5c93d1" +content-hash = "a681f27b61c03298e227dda475b28a81ba0d6a98cec9ea3b028dace04b992bcc" diff --git a/airbyte-integrations/connectors/source-microsoft-onedrive/pyproject.toml b/airbyte-integrations/connectors/source-microsoft-onedrive/pyproject.toml index 505f87187623..4f40e46627be 100644 --- a/airbyte-integrations/connectors/source-microsoft-onedrive/pyproject.toml +++ b/airbyte-integrations/connectors/source-microsoft-onedrive/pyproject.toml @@ -3,7 +3,7 @@ requires = [ "poetry-core>=1.0.0",] build-backend = "poetry.core.masonry.api" [tool.poetry] -version = "0.1.9" +version = "0.2.0" name = "source-microsoft-onedrive" description = "Source implementation for Microsoft OneDrive." authors = [ "Airbyte ",] @@ -28,7 +28,7 @@ source-microsoft-onedrive = "source_microsoft_onedrive.run:run" [tool.poetry.dependencies.airbyte-cdk] extras = [ "file-based",] -version = "==0.61.0" +version = "^0" [tool.poetry.group.dev.dependencies] requests-mock = "^1.11.0" diff --git a/airbyte-integrations/connectors/source-microsoft-onedrive/source_microsoft_onedrive/spec.py b/airbyte-integrations/connectors/source-microsoft-onedrive/source_microsoft_onedrive/spec.py index 180993a685c7..b5bc8890dacc 100644 --- a/airbyte-integrations/connectors/source-microsoft-onedrive/source_microsoft_onedrive/spec.py +++ b/airbyte-integrations/connectors/source-microsoft-onedrive/source_microsoft_onedrive/spec.py @@ -89,8 +89,20 @@ class Config: drive_name: Optional[str] = Field( title="Drive Name", description="Name of the Microsoft OneDrive drive where the file(s) exist.", default="OneDrive", order=2 ) + + search_scope: str = Field( + title="Search Scope", + description="Specifies the location(s) to search for files. 
Valid options are 'ACCESSIBLE_DRIVES' to search in the selected OneDrive drive, 'SHARED_ITEMS' for shared items the user has access to, and 'ALL' to search both.", + default="ALL", + enum=["ACCESSIBLE_DRIVES", "SHARED_ITEMS", "ALL"], + order=3, + ) + folder_path: str = Field( - title="Folder Path", description="Path to folder of the Microsoft OneDrive drive where the file(s) exist.", order=3 + title="Folder Path", + description="Path to a specific folder within the drives to search for files. Leave empty to search all folders of the drives. This does not apply to shared items.", + order=4, + default=".", ) @classmethod diff --git a/airbyte-integrations/connectors/source-microsoft-onedrive/source_microsoft_onedrive/stream_reader.py b/airbyte-integrations/connectors/source-microsoft-onedrive/source_microsoft_onedrive/stream_reader.py index 1fbd5d665bb9..d60985c838ed 100644 --- a/airbyte-integrations/connectors/source-microsoft-onedrive/source_microsoft_onedrive/stream_reader.py +++ b/airbyte-integrations/connectors/source-microsoft-onedrive/source_microsoft_onedrive/stream_reader.py @@ -3,10 +3,12 @@ # import logging +from datetime import datetime from functools import lru_cache from io import IOBase -from typing import Iterable, List, Optional +from typing import Iterable, List, Optional, Tuple +import requests import smart_open from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode from airbyte_cdk.sources.file_based.remote_file import RemoteFile @@ -75,14 +77,30 @@ class SourceMicrosoftOneDriveStreamReader(AbstractFileBasedStreamReader): def __init__(self): super().__init__() + self._auth_client = None + self._one_drive_client = None @property def config(self) -> SourceMicrosoftOneDriveSpec: return self._config @property - def one_drive_client(self) -> SourceMicrosoftOneDriveSpec: - return SourceMicrosoftOneDriveClient(self._config).client + def auth_client(self): + # Lazy initialization of the auth_client + if self._auth_client is None: + self._auth_client = SourceMicrosoftOneDriveClient(self._config) + return self._auth_client + + @property + def one_drive_client(self): + # Lazy initialization of the one_drive_client + if self._one_drive_client is None: + self._one_drive_client = self.auth_client.client + return self._one_drive_client + + def get_access_token(self): + # Directly fetch a new access token from the auth_client each time it's called + return self.auth_client._get_access_token()["access_token"] @config.setter def config(self, value: SourceMicrosoftOneDriveSpec): @@ -95,6 +113,81 @@ def config(self, value: SourceMicrosoftOneDriveSpec): assert isinstance(value, SourceMicrosoftOneDriveSpec) self._config = value + @property + @lru_cache(maxsize=None) + def drives(self): + """ + Retrieves and caches OneDrive drives, including the user's drive based on authentication type. 
+        """
+        drives = self.one_drive_client.drives.get().execute_query()
+
+        if self.config.credentials.auth_type == "Client":
+            my_drive = self.one_drive_client.me.drive.get().execute_query()
+        else:
+            my_drive = (
+                self.one_drive_client.users.get_by_principal_name(self.config.credentials.user_principal_name).drive.get().execute_query()
+            )
+
+        drives.add_child(my_drive)
+
+        # keep only OneDrive drives (personal or business)
+        drives = list(filter(lambda drive: drive.drive_type in ["personal", "business"], drives))
+
+        return drives
+
+    def _get_shared_drive_object(self, drive_id: str, object_id: str, path: str) -> Iterable[Tuple[str, str, datetime]]:
+        """
+        Recursively yields all nested files under the specified object.
+        Args:
+            drive_id: The ID of the drive containing the object.
+            object_id: The ID of the object to start the search from.
+            path: The path prefix prepended to yielded file paths.
+        Yields:
+            Tuples containing file information (path, download URL, and last modified datetime).
+        Raises:
+            RuntimeError: If an error occurs during the request.
+        """
+
+        access_token = self.get_access_token()
+        headers = {"Authorization": f"Bearer {access_token}"}
+        base_url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}"
+
+        def get_files(url: str, path: str) -> Iterable[Tuple[str, str, datetime]]:
+            response = requests.get(url, headers=headers)
+            if response.status_code != 200:
+                error_info = response.json().get("error", {}).get("message", "No additional error information provided.")
+                raise RuntimeError(f"Failed to retrieve files from URL '{url}'. HTTP status: {response.status_code}. Error: {error_info}")
+
+            data = response.json()
+            for child in data.get("value", []):
+                new_path = path + "/" + child["name"]
+                if child.get("file"):  # Object is a file
+                    last_modified = datetime.strptime(child["lastModifiedDateTime"], "%Y-%m-%dT%H:%M:%SZ")
+                    yield (new_path, child["@microsoft.graph.downloadUrl"], last_modified)
+                else:  # Object is a folder, retrieve children
+                    child_url = f"{base_url}/items/{child['id']}/children"  # Use item endpoint for nested objects
+                    yield from get_files(child_url, new_path)
+
+        # Initial request to item endpoint
+        item_url = f"{base_url}/items/{object_id}"
+        item_response = requests.get(item_url, headers=headers)
+        if item_response.status_code != 200:
+            error_info = item_response.json().get("error", {}).get("message", "No additional error information provided.")
+            raise RuntimeError(
+                f"Failed to retrieve the initial shared object with ID '{object_id}' from drive '{drive_id}'. "
+                f"HTTP status: {item_response.status_code}. 
Error: {error_info}" + ) + + # Check if the object is a file or a folder + item_data = item_response.json() + if item_data.get("file"): # Initial object is a file + new_path = path + "/" + item_data["name"] + last_modified = datetime.strptime(item_data["lastModifiedDateTime"], "%Y-%m-%dT%H:%M:%SZ") + yield (new_path, item_data["@microsoft.graph.downloadUrl"], last_modified) + else: + # Initial object is a folder, start file retrieval + yield from get_files(f"{item_url}/children", path) + def list_directories_and_files(self, root_folder, path=None): """Enumerates folders and files starting from a root folder.""" drive_items = root_folder.children.get().execute_query() @@ -102,48 +195,60 @@ def list_directories_and_files(self, root_folder, path=None): for item in drive_items: item_path = path + "/" + item.name if path else item.name if item.is_file: - found_items.append((item, item_path)) + found_items.append((item_path, item.properties["@microsoft.graph.downloadUrl"], item.properties["lastModifiedDateTime"])) else: found_items.extend(self.list_directories_and_files(item, item_path)) return found_items - def get_files_by_drive_name(self, drives, drive_name, folder_path): + def get_files_by_drive_name(self, drive_name, folder_path): """Yields files from the specified drive.""" path_levels = [level for level in folder_path.split("/") if level] folder_path = "/".join(path_levels) - for drive in drives: - is_onedrive = drive.drive_type in ["personal", "business"] - if drive.name == drive_name and is_onedrive: + for drive in self.drives: + if drive.name == drive_name: folder = drive.root if folder_path in self.ROOT_PATH else drive.root.get_by_path(folder_path).get().execute_query() yield from self.list_directories_and_files(folder) + def _get_shared_files_from_all_drives(self, parsed_drive_id: str): + shared_drive_items = self.one_drive_client.me.drive.shared_with_me().execute_query() + for drive_item in shared_drive_items: + parent_reference = drive_item.remote_item.parentReference + + # check if drive is already parsed + if parent_reference and parent_reference["driveId"] != parsed_drive_id: + yield from self._get_shared_drive_object(parent_reference["driveId"], drive_item.id, drive_item.web_url) + + def get_all_files(self): + if self.config.search_scope in ("ACCESSIBLE_DRIVES", "ALL"): + # Get files from accessible drives + yield from self.get_files_by_drive_name(self.config.drive_name, self.config.folder_path) + + if self.config.search_scope in ("SHARED_ITEMS", "ALL"): + selected_drive = list(filter(lambda drive: drive.name == self.config.drive_name, self.drives)) + selected_drive_id = selected_drive[0].id if selected_drive else None + + if self.config.search_scope == "SHARED_ITEMS": + selected_drive_id = None + + # Get files from shared items + yield from self._get_shared_files_from_all_drives(selected_drive_id) + def get_matching_files(self, globs: List[str], prefix: Optional[str], logger: logging.Logger) -> Iterable[RemoteFile]: """ Retrieve all files matching the specified glob patterns in OneDrive. 
""" - drives = self.one_drive_client.drives.get().execute_query() - - if self.config.credentials.auth_type == "Client": - my_drive = self.one_drive_client.me.drive.get().execute_query() - else: - my_drive = ( - self.one_drive_client.users.get_by_principal_name(self.config.credentials.user_principal_name).drive.get().execute_query() - ) - - drives.add_child(my_drive) - - files = self.get_files_by_drive_name(drives, self.config.drive_name, self.config.folder_path) + files = self.get_all_files() try: - first_file, path = next(files) + path, download_url, last_modified = next(files) yield from self.filter_files_by_globs_and_start_date( [ MicrosoftOneDriveRemoteFile( uri=path, - download_url=first_file.properties["@microsoft.graph.downloadUrl"], - last_modified=first_file.properties["lastModifiedDateTime"], + download_url=download_url, + last_modified=last_modified, ) ], globs, @@ -161,10 +266,10 @@ def get_matching_files(self, globs: List[str], prefix: Optional[str], logger: lo [ MicrosoftOneDriveRemoteFile( uri=path, - download_url=file.properties["@microsoft.graph.downloadUrl"], - last_modified=file.properties["lastModifiedDateTime"], + download_url=download_url, + last_modified=last_modified, ) - for file, path in files + for path, download_url, last_modified in files ], globs, ) diff --git a/airbyte-integrations/connectors/source-microsoft-onedrive/unit_tests/unit_tests.py b/airbyte-integrations/connectors/source-microsoft-onedrive/unit_tests/unit_tests.py index f610ad67a646..f89fdd287081 100644 --- a/airbyte-integrations/connectors/source-microsoft-onedrive/unit_tests/unit_tests.py +++ b/airbyte-integrations/connectors/source-microsoft-onedrive/unit_tests/unit_tests.py @@ -2,8 +2,10 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # -from unittest.mock import Mock, patch +from datetime import datetime +from unittest.mock import MagicMock, Mock, PropertyMock, call, patch +import pytest from source_microsoft_onedrive.spec import SourceMicrosoftOneDriveSpec from source_microsoft_onedrive.stream_reader import FileReadMode, SourceMicrosoftOneDriveClient, SourceMicrosoftOneDriveStreamReader @@ -28,7 +30,7 @@ def test_open_file(mock_smart_open): with stream_reader.open_file(mock_file, FileReadMode.READ, "utf-8", mock_logger) as result: pass - mock_smart_open.assert_called_once_with(mock_file.download_url, mode='r', encoding='utf-8') + mock_smart_open.assert_called_once_with(mock_file.download_url, mode="r", encoding="utf-8") assert result is not None @@ -40,15 +42,18 @@ def test_microsoft_onedrive_client_initialization(requests_mock): "client_id": "client_id", "tenant_id": "tenant_id", "client_secret": "client_secret", - "refresh_token": "refresh_token" + "refresh_token": "refresh_token", }, "drive_name": "drive_name", "folder_path": "folder_path", - "streams": [{"name": "test_stream", "globs": ["*.csv"], "validation_policy": "Emit Record", "format": {"filetype": "csv"}}] + "streams": [{"name": "test_stream", "globs": ["*.csv"], "validation_policy": "Emit Record", "format": {"filetype": "csv"}}], } - authority_url = 'https://login.microsoftonline.com/tenant_id/v2.0/.well-known/openid-configuration' - mock_response = {'authorization_endpoint': 'https://login.microsoftonline.com/tenant_id/oauth2/v2.0/authorize', 'token_endpoint': 'https://login.microsoftonline.com/tenant_id/oauth2/v2.0/token'} + authority_url = "https://login.microsoftonline.com/tenant_id/v2.0/.well-known/openid-configuration" + mock_response = { + "authorization_endpoint": 
"https://login.microsoftonline.com/tenant_id/oauth2/v2.0/authorize", + "token_endpoint": "https://login.microsoftonline.com/tenant_id/oauth2/v2.0/token", + } requests_mock.get(authority_url, json=mock_response, status_code=200) client = SourceMicrosoftOneDriveClient(SourceMicrosoftOneDriveSpec(**config)) @@ -92,9 +97,199 @@ def test_get_files_by_drive_name(mock_list_directories_and_files): stream_reader = SourceMicrosoftOneDriveStreamReader() stream_reader._config = Mock() - # Call the method - files = list(stream_reader.get_files_by_drive_name([mock_drive], "testDrive", "/test/path")) + with patch.object(SourceMicrosoftOneDriveStreamReader, "drives", new_callable=PropertyMock) as mock_drives: + mock_drives.return_value = [mock_drive] + + # Call the method + files = list(stream_reader.get_files_by_drive_name("testDrive", "/test/path")) # Assertions assert len(files) == 1 assert files[0].name == "testFile.txt" + + +@pytest.mark.parametrize( + "selected_drive_id, drive_ids, shared_drive_item_dicts, expected_result, expected_calls", + [ + (None, [1, 2, 3], [], [], []), + (1, [1, 2, 3], [{"drive_id": 1, "id": 4, "web_url": "test_url4"}], [], []), + (1, [1, 2, 3], [{"drive_id": 4, "id": 4, "web_url": "test_url4"}], [4], [call(4, 4, "test_url4")]), + ( + 2, + [1, 2, 3], + [{"drive_id": 4, "id": 4, "web_url": "test_url4"}, {"drive_id": 5, "id": 5, "web_url": "test_url5"}], + [4, 5], + [call(4, 4, "test_url4"), call(5, 5, "test_url5")], + ), + ( + 3, + [1, 2, 3], + [ + {"drive_id": 4, "id": 4, "web_url": "test_url4"}, + {"drive_id": 5, "id": 5, "web_url": "test_url5"}, + {"drive_id": 6, "id": 6, "web_url": "test_url6"}, + ], + [4, 5, 6], + [call(4, 4, "test_url4"), call(5, 5, "test_url5"), call(6, 6, "test_url6")], + ), + ], +) +def test_get_shared_files_from_all_drives(selected_drive_id, drive_ids, shared_drive_item_dicts, expected_result, expected_calls): + stream_reader = SourceMicrosoftOneDriveStreamReader() + stream_reader._config = Mock() + + # Mock _get_shared_drive_object method + with patch.object( + SourceMicrosoftOneDriveStreamReader, "_get_shared_drive_object", return_value=expected_result + ) as mock_get_shared_drive_object: + # Setup shared_drive_items mock objects + shared_drive_items = [ + MagicMock(remote_item=MagicMock(parentReference={"driveId": item["drive_id"]}), id=item["id"], web_url=item["web_url"]) + for item in shared_drive_item_dicts + ] + + with patch.object(SourceMicrosoftOneDriveStreamReader, "one_drive_client", new_callable=PropertyMock) as mock_one_drive_client: + mock_one_drive_client.return_value.me.drive.shared_with_me.return_value.execute_query.return_value = shared_drive_items + + with patch.object(SourceMicrosoftOneDriveStreamReader, "drives", new_callable=PropertyMock) as mock_drives: + mock_drives.return_value = [Mock(id=drive_id) for drive_id in drive_ids] + + # Execute the method under test + list(stream_reader._get_shared_files_from_all_drives(selected_drive_id)) + + # Assert _get_shared_drive_object was called correctly + mock_get_shared_drive_object.assert_has_calls(expected_calls, any_order=True) + + +# Sample data for mocking responses +file_response = { + "file": True, + "name": "TestFile.txt", + "@microsoft.graph.downloadUrl": "http://example.com/download", + "lastModifiedDateTime": "2021-01-01T00:00:00Z", +} + +empty_folder_response = {"folder": True, "value": []} + +# Adjusting the folder_with_nested_files to represent the initial folder response +folder_with_nested_files_initial = { + "folder": True, + "value": [ + {"id": "subfolder1", "folder": 
True, "name": "subfolder1"}, + {"id": "subfolder2", "folder": True, "name": "subfolder2"}, + ], # Empty subfolder # Subfolder with a file +} + +# Response for the empty subfolder (subfolder1) +empty_subfolder_response = {"value": [], "name": "subfolder1"} # No files or folders inside subfolder1 + +# Response for the subfolder with a file (subfolder2) +not_empty_subfolder_response = { + "value": [ + { + "file": True, + "name": "NestedFile.txt", + "@microsoft.graph.downloadUrl": "http://example.com/nested", + "lastModifiedDateTime": "2021-01-02T00:00:00Z", + } + ], + "name": "subfolder2", +} + + +@pytest.mark.parametrize( + "initial_response, subsequent_responses, expected_result, raises_error, expected_error_message, initial_path", + [ + # Object ID is a file + ( + file_response, + [], + [ + ( + "http://example.com/TestFile.txt", + "http://example.com/download", + datetime.strptime("2021-01-01T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ"), + ) + ], + False, + None, + "http://example.com", + ), + # Object ID is an empty folder + (empty_folder_response, [empty_subfolder_response], [], False, None, "http://example.com"), + # Object ID is a folder with empty subfolders and files + ( + {"folder": True, "name": "root"}, # Initial folder response + [ + folder_with_nested_files_initial, + empty_subfolder_response, + not_empty_subfolder_response, + ], + [ + ( + "http://example.com/subfolder2/NestedFile.txt", + "http://example.com/nested", + datetime.strptime("2021-01-02T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ"), + ) + ], + False, + None, + "http://example.com", + ), + # Error response on initial request + ( + MagicMock(status_code=400, json=MagicMock(return_value={"error": {"message": "Bad Request"}})), + [], + [], + True, + "Failed to retrieve the initial shared object with ID 'dummy_object_id' from drive 'dummy_drive_id'. HTTP status: 400. Error: Bad Request", + "http://example.com", + ), + # Error response while iterating over nested + ( + {"folder": True, "name": "root"}, + [MagicMock(status_code=400, json=MagicMock(return_value={"error": {"message": "Bad Request"}}))], + [], + True, + ( + "Failed to retrieve files from URL " + "'https://graph.microsoft.com/v1.0/drives/dummy_drive_id/items/dummy_object_id/children'. " + "HTTP status: 400. 
Error: Bad Request" + ), + "http://example.com", + ), + ], +) +@patch("source_microsoft_onedrive.stream_reader.requests.get") +@patch("source_microsoft_onedrive.stream_reader.SourceMicrosoftOneDriveStreamReader.get_access_token") +def test_get_shared_drive_object( + mock_get_access_token, + mock_requests_get, + initial_response, + subsequent_responses, + expected_result, + raises_error, + expected_error_message, + initial_path, +): + mock_get_access_token.return_value = "dummy_access_token" + mock_responses = [ + initial_response + if isinstance(initial_response, MagicMock) + else MagicMock(status_code=200, json=MagicMock(return_value=initial_response)) + ] + for response in subsequent_responses: + mock_responses.append( + response if isinstance(response, MagicMock) else MagicMock(status_code=200, json=MagicMock(return_value=response)) + ) + mock_requests_get.side_effect = mock_responses + + reader = SourceMicrosoftOneDriveStreamReader() + + if raises_error: + with pytest.raises(RuntimeError) as exc_info: + list(reader._get_shared_drive_object("dummy_drive_id", "dummy_object_id", initial_path)) + assert str(exc_info.value) == expected_error_message + else: + result = list(reader._get_shared_drive_object("dummy_drive_id", "dummy_object_id", initial_path)) + assert result == expected_result diff --git a/docs/integrations/sources/microsoft-onedrive.md b/docs/integrations/sources/microsoft-onedrive.md index 7e74518c79e4..f5ac00ffed8b 100644 --- a/docs/integrations/sources/microsoft-onedrive.md +++ b/docs/integrations/sources/microsoft-onedrive.md @@ -21,16 +21,17 @@ This page contains the setup guide and reference information for the Microsoft O 3. On the Set up the source page, select **Microsoft OneDrive** from the Source type dropdown. 4. Enter the name for the Microsoft OneDrive connector. 5. Enter **Drive Name**. To find your drive name go to settings and at the top of setting menu you can find the name of your drive. -6. Enter **Folder Path**. -7. The **OAuth2.0** authorization method is selected by default. Click **Authenticate your Microsoft OneDrive account**. Log in and authorize your Microsoft account. -8. For **Start Date**, enter the date in YYYY-MM-DD format. The data added on and after this date will be replicated. -9. Add a stream: - 1. Write the **File Type** - 2. In the **Format** box, use the dropdown menu to select the format of the files you'd like to replicate. The supported formats are **CSV**, **Parquet**, **Avro** and **JSONL**. Toggling the **Optional fields** button within the **Format** box will allow you to enter additional configurations based on the selected format. For a detailed breakdown of these settings, refer to the [File Format section](#file-format-settings) below. - 3. Give a **Name** to the stream - 4. (Optional) - If you want to enforce a specific schema, you can enter a **Input schema**. By default, this value is set to `{}` and will automatically infer the schema from the file\(s\) you are replicating. For details on providing a custom schema, refer to the [User Schema section](#user-schema). - 5. Optionally, enter the **Globs** which dictates which files to be synced. This is a regular expression that allows Airbyte to pattern match the specific files to replicate. If you are replicating all the files within your bucket, use `**` as the pattern. For more precise pattern matching options, refer to the [Path Patterns section](#path-patterns) below. -10. Click **Set up source** +6. Select **Search Scope**. Specifies the location(s) to search for files. 
Valid options are 'ACCESSIBLE_DRIVES' to search in the selected OneDrive drive, 'SHARED_ITEMS' for shared items the user has access to, and 'ALL' to search both. Default value is 'ALL'.
+7. Enter **Folder Path**. Leave empty to search all folders of the drives. This does not apply to shared items.
+8. The **OAuth2.0** authorization method is selected by default. Click **Authenticate your Microsoft OneDrive account**. Log in and authorize your Microsoft account.
+9. For **Start Date**, enter the date in YYYY-MM-DD format. The data added on and after this date will be replicated.
+10. Add a stream:
+   1. Write the **File Type**
+   2. In the **Format** box, use the dropdown menu to select the format of the files you'd like to replicate. The supported formats are **CSV**, **Parquet**, **Avro** and **JSONL**. Toggling the **Optional fields** button within the **Format** box will allow you to enter additional configurations based on the selected format. For a detailed breakdown of these settings, refer to the [File Format section](#file-format-settings) below.
+   3. Give a **Name** to the stream
+   4. (Optional) - If you want to enforce a specific schema, you can enter an **Input schema**. By default, this value is set to `{}` and will automatically infer the schema from the file\(s\) you are replicating. For details on providing a custom schema, refer to the [User Schema section](#user-schema).
+   5. Optionally, enter the **Globs**, which dictate which files will be synced. This is a regular expression that allows Airbyte to pattern match the specific files to replicate. If you are replicating all the files within your bucket, use `**` as the pattern. For more precise pattern matching options, refer to the [Path Patterns section](#path-patterns) below.
+11. Click **Set up source**

@@ -80,18 +81,19 @@ This source requires **Application permissions**. Follow these [instructions](ht
 3. On the **Set up** the source page, select **Microsoft OneDrive** from the Source type dropdown.
 4. Enter the name for the Microsoft OneDrive connector.
 5. Enter **Drive Name**. To find your drive name go to settings and at the top of setting menu you can find the name of your drive.
-6. Enter **Folder Path**.
-7. Switch to **Service Key Authentication**
-8. For **User Practical Name**, enter the [UPN](https://learn.microsoft.com/en-us/sharepoint/list-onedrive-urls) for your user.
-9. Enter **Tenant ID**, **Client ID** and **Client secret**.
-10. For **Start Date**, enter the date in YYYY-MM-DD format. The data added on and after this date will be replicated.
-11. Add a stream:
+6. Select **Search Scope**. Specifies the location(s) to search for files. Valid options are 'ACCESSIBLE_DRIVES' to search in the selected OneDrive drive, 'SHARED_ITEMS' for shared items the user has access to, and 'ALL' to search both. Default value is 'ALL'.
+7. Enter **Folder Path**. Leave empty to search all folders of the drives. This does not apply to shared items.
+8. Switch to **Service Key Authentication**
+9. For **User Principal Name**, enter the [UPN](https://learn.microsoft.com/en-us/sharepoint/list-onedrive-urls) for your user.
+10. Enter **Tenant ID**, **Client ID** and **Client secret**.
+11. For **Start Date**, enter the date in YYYY-MM-DD format. The data added on and after this date will be replicated.
+12. Add a stream:
    1. Write the **File Type**
    2. In the **Format** box, use the dropdown menu to select the format of the files you'd like to replicate. The supported formats are **CSV**, **Parquet**, **Avro** and **JSONL**. 
Toggling the **Optional fields** button within the **Format** box will allow you to enter additional configurations based on the selected format. For a detailed breakdown of these settings, refer to the [File Format section](#file-format-settings) below. 3. Give a **Name** to the stream 4. (Optional) - If you want to enforce a specific schema, you can enter a **Input schema**. By default, this value is set to `{}` and will automatically infer the schema from the file\(s\) you are replicating. For details on providing a custom schema, refer to the [User Schema section](#user-schema). 5. Optionally, enter the **Globs** which dictates which files to be synced. This is a regular expression that allows Airbyte to pattern match the specific files to replicate. If you are replicating all the files within your bucket, use `**` as the pattern. For more precise pattern matching options, refer to the [Path Patterns section](#path-patterns) below. -12. Click **Set up source** +13. Click **Set up source** @@ -119,15 +121,16 @@ The connector is restricted by normal Microsoft Graph [requests limitation](http ## Changelog -| Version | Date | Pull Request | Subject | -|:--------|:-----------|:---------------------------------------------------------|:------------------------------------------------------------------------| -| 0.1.9 | 2024-03-11 | [35956](https://github.com/airbytehq/airbyte/pull/35956) | Pin `transformers` transitive dependency | -| 0.1.8 | 2024-03-06 | [35858](https://github.com/airbytehq/airbyte/pull/35858) | Bump poetry.lock to upgrade transitive dependency | -| 0.1.7 | 2024-03-0q | [34936](https://github.com/airbytehq/airbyte/pull/34936) | Enable in Cloud | -| 0.1.6 | 2024-02-06 | [34936](https://github.com/airbytehq/airbyte/pull/34936) | Bump CDK version to avoid missing SyncMode errors | -| 0.1.5 | 2024-01-30 | [34681](https://github.com/airbytehq/airbyte/pull/34681) | Unpin CDK version to make compatible with the Concurrent CDK | -| 0.1.4 | 2024-01-30 | [34661](https://github.com/airbytehq/airbyte/pull/34661) | Pin CDK version until upgrade for compatibility with the Concurrent CDK | -| 0.1.3 | 2024-01-24 | [34478](https://github.com/airbytehq/airbyte/pull/34478) | Fix OAuth | -| 0.1.2 | 2021-12-22 | [33745](https://github.com/airbytehq/airbyte/pull/33745) | Add ql and sl to metadata | -| 0.1.1 | 2021-12-15 | [33758](https://github.com/airbytehq/airbyte/pull/33758) | Fix for docs name | -| 0.1.0 | 2021-12-06 | [32655](https://github.com/airbytehq/airbyte/pull/32655) | New source | +| Version | Date | Pull Request | Subject | +|:--------|:-----------|:---------------------------------------------------------|:------------------------------------------------------------------------------------------------| +| 0.2.0 | 2024-03-12 | [35849](https://github.com/airbytehq/airbyte/pull/35849) | Add fetching shared items | +| 0.1.9 | 2024-03-11 | [35956](https://github.com/airbytehq/airbyte/pull/35956) | Pin `transformers` transitive dependency | +| 0.1.8 | 2024-03-06 | [35858](https://github.com/airbytehq/airbyte/pull/35858) | Bump poetry.lock to upgrade transitive dependency | +| 0.1.7 | 2024-03-04 | [35584](https://github.com/airbytehq/airbyte/pull/35584) | Enable in Cloud | +| 0.1.6 | 2024-02-06 | [34936](https://github.com/airbytehq/airbyte/pull/34936) | Bump CDK version to avoid missing SyncMode errors | +| 0.1.5 | 2024-01-30 | [34681](https://github.com/airbytehq/airbyte/pull/34681) | Unpin CDK version to make compatible with the Concurrent CDK | +| 0.1.4 | 2024-01-30 | 
[34661](https://github.com/airbytehq/airbyte/pull/34661) | Pin CDK version until upgrade for compatibility with the Concurrent CDK                          |
+| 0.1.3   | 2024-01-24 | [34478](https://github.com/airbytehq/airbyte/pull/34478)  | Fix OAuth                                                                                         |
+| 0.1.2   | 2023-12-22 | [33745](https://github.com/airbytehq/airbyte/pull/33745)  | Add ql and sl to metadata                                                                         |
+| 0.1.1   | 2023-12-15 | [33758](https://github.com/airbytehq/airbyte/pull/33758)  | Fix for docs name                                                                                 |
+| 0.1.0   | 2023-12-06 | [32655](https://github.com/airbytehq/airbyte/pull/32655)  | New source                                                                                        |
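
A note for readers on the `strategy` option added to the unstructured format in spec.json: the enum values correspond to the `strategy` argument of the `unstructured` package's partition functions pinned in the lockfile above. A minimal sketch of what each setting does to a PDF, calling `unstructured` directly rather than through the connector (`sample.pdf` is a placeholder):

```python
from unstructured.partition.pdf import partition_pdf

# "fast" pulls embedded text straight out of the PDF: quick, but it yields nothing
# for scanned documents, which is why the connector keeps "auto" as the default.
elements = partition_pdf(filename="sample.pdf", strategy="fast")

# "ocr_only" rasterizes each page and runs Tesseract OCR instead: slower but
# reliable on scans. Per the spec description, "hi_res" additionally requires a
# hosted unstructured instance and cannot be used with local processing mode.
# elements = partition_pdf(filename="sample.pdf", strategy="ocr_only")

print("\n".join(str(element) for element in elements))
```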
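On the new `drives` property in stream_reader.py: stacking `@property` on `@lru_cache(maxsize=None)` makes the Graph drive lookup run once per reader instance. A toy illustration of that caching behaviour (the `Reader` class here is hypothetical, not connector code); `functools.cached_property` would achieve the same effect without the class-level cache retaining a reference to every instance:

```python
from functools import lru_cache


class Reader:
    def __init__(self) -> None:
        self.lookups = 0

    @property
    @lru_cache(maxsize=None)
    def drives(self):
        # Stands in for the expensive Graph API query; counts how often it runs.
        self.lookups += 1
        return ["personal", "business"]


reader = Reader()
assert reader.drives == ["personal", "business"]
assert reader.drives == ["personal", "business"]
assert reader.lookups == 1  # the second access was served from the cache
```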
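The `_get_shared_drive_object` helper is, at its core, a depth-first walk over the Microsoft Graph `/children` endpoint. A standalone sketch of the same pattern, assuming a valid bearer token (`walk_drive_item` is an illustrative name, and, like the connector code, the sketch does not follow `@odata.nextLink` pagination):

```python
from datetime import datetime
from typing import Iterable, Tuple

import requests

GRAPH_BASE = "https://graph.microsoft.com/v1.0"


def walk_drive_item(token: str, drive_id: str, item_id: str, path: str = "") -> Iterable[Tuple[str, str, datetime]]:
    """Depth-first walk yielding (path, download_url, last_modified) for each file under an item."""
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(f"{GRAPH_BASE}/drives/{drive_id}/items/{item_id}/children", headers=headers)
    response.raise_for_status()
    for child in response.json().get("value", []):
        child_path = f"{path}/{child['name']}"
        if "file" in child:  # leaf node: emit the file's metadata
            yield (
                child_path,
                child["@microsoft.graph.downloadUrl"],
                datetime.strptime(child["lastModifiedDateTime"], "%Y-%m-%dT%H:%M:%SZ"),
            )
        else:  # folder: recurse into its children
            yield from walk_drive_item(token, drive_id, child["id"], child_path)
```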
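Finally, for anyone exercising the new options end to end, a hypothetical source configuration combining `search_scope` and `folder_path` (field values mirror the unit-test fixture above; all credentials are placeholders):

```python
config = {
    "credentials": {
        "auth_type": "Client",
        "client_id": "<client-id>",
        "tenant_id": "<tenant-id>",
        "client_secret": "<client-secret>",
        "refresh_token": "<refresh-token>",
    },
    "drive_name": "OneDrive",
    "search_scope": "ALL",  # one of "ACCESSIBLE_DRIVES", "SHARED_ITEMS", "ALL"
    "folder_path": ".",  # the default "." searches every folder; ignored for shared items
    "streams": [{"name": "my_csv_stream", "globs": ["**/*.csv"], "validation_policy": "Emit Record", "format": {"filetype": "csv"}}],
}
```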