diff --git a/packages/gmail/CHANGELOG.md b/packages/gmail/CHANGELOG.md index c21e07c84..a7842812d 100644 --- a/packages/gmail/CHANGELOG.md +++ b/packages/gmail/CHANGELOG.md @@ -1,5 +1,18 @@ # @openfn/language-gmail +## 1.0.1 + +### Patch Changes + +- 01b4aa9: This patch includes breaking changes to the API - but since the + adaptor has only been released for a couple of days we don't anticipate this + affecting any users. + + - Removed the `userId` parameter from `getContentsFromMessages()`. Pass + `options.email` instead. + - Renamed `options.desiredContents` to `options.contents` + - Revised documentation for clarity and accuracy + ## 1.0.0 Initial release. diff --git a/packages/gmail/README.md b/packages/gmail/README.md index 3576c52f7..9ba80e6b9 100644 --- a/packages/gmail/README.md +++ b/packages/gmail/README.md @@ -1,103 +1,140 @@ -# Gmail Message Content Extraction +# Gmail Adaptor -This adaptor is used to extract specific content from Gmail messages using -custom "desiredContent" configurations. The sample code specifies how to query -Gmail for messages and identify desired attachments and metadata. +## What it does -## How It Works +This adaptor is used to extract specific content from Gmail messages using custom desired "content" configurations. The sample code specifies how to query Gmail for messages and identify desired attachments and metadata. -1. Gmail Query: Constructs a Gmail query string to filter relevant messages. -2. Desired Content Matching: Uses the desiredContents array to identify and - extract: - - Metadata files - - Archive files and their contents - - Message metadata (subject, date, from, body) -3. Output: Returns a structured collection of matched content. +Without any parameters, the `getContentsFromMessages()` function will return an array containing every message in the account of the authenticated user including `from`, `date` and `subject`. 
-## Usage +A number of options are available to isolate the desired messages and to customize the output. -The adaptor's primary function is `getContentsFromMessages` +## Options -```js -getContentsFromMessages(userId, query, desiredContents, callback); -``` +Optional parameters include: `contents`, `query`, `email`, `processedIds` -1. Set userId with the Gmail account to query. -2. Customize the query to contain filters as needed. This is the same format a - the query in the Gmail UI. -3. Specify what content to retrieve from messages (body, subject, attachments, - etc) +### options.contents -## Extracting Message Contents +Use the `options.contents` array to specify the content to retrieve from each message. Always included are `from`, `date`, and `subject`. -The `desiredContents` array should list what content to retrieve from each -message. Each item should be a string (ie, `"body"` or an object describing an -attachment) +Each item can be a simple string (e.g., `'body'`, `'subject'`) or a MessageContent object offering advanced configuration. -### Metadata +#### Basic metadata -The following strings can be extracted: +The following types of content can be extracted: - `body`: Extracts the message body. - `subject`: Extracts the email subject. - `date`: Extracts the timestamp of the email. - `from`: Extracts the sender's information. -### Attachment: basic file +Optionally, each of these content strings can be expanded to include additional specifications: + +```js +const mySubject = { + type: 'subject', + name: 'email-title', + maxLength: 25, +}; +``` + +- The `type` property instructs the function which content type to extract. +- The `name` property allows you to add a custom name to this information. +- The `maxLength` property allows you to limit the length of the content returned. + +#### Attachment: basic file + +Extract content from a file attachment. -Extract the content from a file attachment. 
Specify the file name with a regular -expression on the `file` key. +`file`: Identify the attached file by providing its name as a string or using a regular expression to match a pattern. ```js -{ - type: "file", - name: "metadata", +const myMetadata = { + type: 'file', + name: 'metadata', file: /^summary\.txt$/, -} +}; +``` + +```js +const myMetadata = { + type: 'file', + file: 'summary.txt', + maxLength: 500, +}; ``` -### Attachment: archived file +#### Attachment: archived file -Extract the content from a file embedded in an archive attachment. +Extract content from a file embedded in an archive attachment. -Specify the archive with a regular expression on the `archive` key. Extract a -file within the archive with the `file` key. +- `archive`: Specify the file name of the archive using either a string for an exact match or a regular expression to match a pattern. +- `file`: Identify the specific file inside the archive by providing its name as a string or using a regular expression to match a pattern. ```js -{ - type: "archive", - name: "data", - archive: /_device_data\.zip$/, +const myArchivedFile = { + type: 'archive', + name: 'data', + archive: 'devicedata.zip', file: /_CURRENT_DATA_\w*?\.json$/, -} + maxLength: 5000, +}; ``` -## Query Setup +```js +options.contents = [mySubject, 'body', myMetadata, myArchivedFile]; +``` -The query variable is constructed to filter Gmail messages: +### options.query -- Inbox messages with the subject containing "30DTR Data". -- Messages sent within the last 31 days. +Use a `query` parameter to filter the messages returned. -## Example +The query syntax supports the same query format as the Gmail `search` box. 
-```js -const userId = 'tester@gmail.com'; +``` +options.query = 'from:someuser@example.com rfc822msgid: is:unread'; +``` + +A full list of supported search operations can be found here: [Refine searches in Gmail](https://support.google.com/mail/answer/7190) + +### options.email + +Optionally specify the email address used for the Gmail account. This is almost always the same email associated with the authenticated user, so this parameter is optional. + +``` +options.email = ''; +``` -const querySubject = encodeURIComponent('device data summary'); +### options.processedIds -// All messages newer than 30 days ago -const MILLISECONDS_PER_DAY = 1000 * 60 * 60 * 24; -const queryAfterDate = new Date(Date.now() - 30 * MILLISECONDS_PER_DAY) - .toISOString() - .split('T')[0]; +In some scenarios, it may be necessary to skip certain messages to prevent the retrieval of duplicate data. Passing an array of messageIds will allow the function to skip these messages if any of the ids are encountered in the returned messages. 
-const query = `in:inbox subject:${querySubject} after:${queryAfterDate}`; +``` +options.processedIds = [ + '194e3cf1ca0ccd66', + '283e2df2ca0ecd75', + '572e1af3ca0bcd84', +]; +``` + +## Example jobs + +```js +const query = 'in:inbox newer_than:2d'; +const contents = ['body']; +getContentsFromMessages({ query, contents }); +``` + +```js +const subject = 'device data summary'.replace(' ', '+'); +const query = `in:inbox subject:${subject} newer_than:1m`; + +const email = 'special_assigned_delegate@gmail.com'; const metadataFile = { type: 'file', name: 'metadata', - file: /^summary\.txt$/, + file: /summary\.txt$/, + maxLength: 500, }; const dataFile = { @@ -107,63 +144,114 @@ const dataFile = { file: /_CURRENT_DATA_\w*?\.json$/, }; -const desiredContents = [ - 'body', - 'subject', - 'date', - 'from', - metadataFile, - dataFile, -]; +const contents = [metadataFile, dataFile]; -getContentsFromMessages(userId, query, desiredContents, state => - console.log(state.data) -); +getContentsFromMessages({ query, email, contents }); ``` -## Sample Output +## Sample `state.data` Output -For each matched message, the extracted content is returned in a collection of -content blocks. Here's an example for a single match message: +For each matched message, the extracted content is returned as a message object of content properties. 
Here's an example `state.data` for a single matched message: ```js [ { messageId: '1934c017c1752c01', - contents: [ - { - name: 'subject', - value: 'Fwd: FW: Facility Temperature Report (Summary Data)', - }, - { - name: 'date', - value: '2024-11-20T23:56:08.000Z', - }, - { - name: 'from', - value: 'Friendly Sender ', - }, - { - name: 'metadata', - value: - '{\n "appInfo": {\n "isAutomaticTime": true,\n "isTrueTime": true,\n "os": "Android",\n "osVe" }', - path: '004800123457501820383131_20241115T102926Z.json', - }, - { - name: 'data', - value: - '{\n "AMOD": "VL45",\n "AMFR": "ICECO",\n "ASER": "BJBC 08 30",\n "ADOP": "2024-04-01",\n "APQS": "E003/XX" }', - path: '004800123457501820383131_CURRENT_DATA_P100DT9H45M46S_20241115T102926Z.json', - }, - ], + from: 'Friendly Sender ', + date: '2024-11-20T23:56:08.000Z', + subject: 'Fwd: FW: Facility Anomaly Report (Summary Data)', + metadata: { + filename: 'daily_summary.txt', + content: '{ "appInfo": { "isAutomaticTime": true }', + }, + data: { + archiveFilename: '0031_device_data.zip', + filename: '0031_CURRENT_DATA_P100DT9H45M46S_20241115T102926Z.json', + content: '{ "AMOD": "VL45", "AMFR": "ICECO" }', + }, }, ]; ``` -Each content block represents a specific piece of information extracted: +Each property on the message object represents a specific piece of information extracted: -- **subject**: Contains the email subject. -- **date**: The timestamp when the email was sent. - **from**: Sender's email and name. -- **metadata**: Metadata file content, with its file path. -- **data**: Data file content, with its file path. +- **date**: The timestamp when the email was sent. +- **subject**: Contains the email subject. +- **metadata**: Metadata-named file content, with its matched file name. +- **data**: Data-named archive file content, with its matched archive name and file name. 
+ +## Acquiring an access token + +The Gmail adaptor implicitly uses the Gmail account of the Google account that is used to authenticate the application. + +Allowing the Gmail adaptor to access a Gmail account is a multi-step process. + +### Create an OAuth 2.0 client ID + +Follow the instructions found here: +https://support.google.com/googleapi/answer/6158849 + +- Go to [Google Cloud Platform Console](https://console.cloud.google.com/) +- Go to "APIs & Services" +- Click "Credentials" +- Click "Create Credentials" +- Select "OAuth client ID" +- Select "Create OAuth client ID" +- Select Application type "Web application" + - Add a uniquely-identifiable name + - Click "Create" +- On the resulting popup screen, find and click "DOWNLOAD JSON" and save this file to a secure location. + +### Use the Postman application to query the OAuth endpoint and retrieve an access token + +Initially, you'll need to configure an authentication request using Postman's built-in OAuth 2.0 implementation: + +- Open Postman +- Create a new request +- Switch to the "Authorization" tab +- On the left side, select Type OAuth 2.0 +- On the right side, scroll down to the "Configure New Token" section +- Fill out the form using information from the downloaded json file from the + previous section + - Token Name: Google Oauth + - Grant Type: Authorization Code + - Auth URL: (found in the json file as auth_uri) + - Access Token URL: (found in the json file as token_uri) + - Client ID: (found in the json file as client_id) + - Client Secret: (found in the json file as client_secret) + - Scope: https://www.googleapis.com/auth/gmail.readonly + - State: (any random string is fine) + - Client Authentication: Send as Basic Auth header + +Once the form is filled out, repeat these steps each hour to retrieve a new +access token: + +- Click on "Get New Access Token" +- A browser will open and you'll be asked to authenticate with your Google Account +- Accept the request to allow this OAuth session 
to access your Google Account. +- In the MANAGE ACCESS TOKENS popup, find and copy the new Access Token +- This access token will be valid for 1 hour. + +### Configure OpenFn CLI to find the access token + +The Gmail adaptor looks for the access token in the configuration section under `access_token`. + +Example configuration using a workflow: + +``` +"workflow": { + "steps": [ + { + "id": "getGmailContent", + "adaptors": [ + "gmail" + ], + "expression": "path/to/gmail.js", + "configuration": { + "access_token": "(access token acquired from Postman)" + } + } + ] +} +``` diff --git a/packages/gmail/assets/rectangle.png b/packages/gmail/assets/rectangle.png index 5fb8b66e0..9028989ea 100644 Binary files a/packages/gmail/assets/rectangle.png and b/packages/gmail/assets/rectangle.png differ diff --git a/packages/gmail/assets/square.png b/packages/gmail/assets/square.png index 60f00c1f3..e88a44e17 100644 Binary files a/packages/gmail/assets/square.png and b/packages/gmail/assets/square.png differ diff --git a/packages/gmail/ast.json b/packages/gmail/ast.json index 73aee084e..93edd1ac5 100644 --- a/packages/gmail/ast.json +++ b/packages/gmail/ast.json @@ -3,8 +3,7 @@ { "name": "getContentsFromMessages", "params": [ - "userId", - "userOptions" + "options" ], "docs": { "description": "Used to isolate the type of content to retrieve from the message.", @@ -16,7 +15,7 @@ "type": "NameExpression", "name": "Object" }, - "name": "DesiredContent" + "name": "MessageContent" }, { "title": "public", @@ -34,20 +33,15 @@ }, { "title": "property", - "description": "A custom description for the content type. 
Optional.", + "description": "A custom description for the content type.", "type": { "type": "OptionalType", "expression": { - "type": "NullableType", - "expression": { - "type": "NameExpression", - "name": "string" - }, - "prefix": false + "type": "NameExpression", + "name": "string" } }, - "name": "name", - "default": "null" + "name": "name" }, { "title": "property", @@ -93,7 +87,7 @@ }, { "title": "property", - "description": "Maximum number of characters to retrieve from the content. Optional.", + "description": "Maximum number of characters to retrieve from the content.", "type": { "type": "OptionalType", "expression": { @@ -105,8 +99,7 @@ "prefix": false } }, - "name": "maxLength", - "default": "null" + "name": "maxLength" } ] }, diff --git a/packages/gmail/package.json b/packages/gmail/package.json index e98ea63f2..ecb58aa25 100644 --- a/packages/gmail/package.json +++ b/packages/gmail/package.json @@ -1,6 +1,6 @@ { "name": "@openfn/language-gmail", - "version": "1.0.0", + "version": "1.0.1", "description": "OpenFn gmail adaptor", "type": "module", "exports": { diff --git a/packages/gmail/src/Adaptor.js b/packages/gmail/src/Adaptor.js index 7620bcaff..888ee6960 100644 --- a/packages/gmail/src/Adaptor.js +++ b/packages/gmail/src/Adaptor.js @@ -11,7 +11,7 @@ import { import { getMessagesResult, getMessageResult, - getDesiredContent, + getContentIndicators, getMessageContent, createConnection, removeConnection, @@ -19,70 +19,85 @@ import { /** * Used to isolate the type of content to retrieve from the message. - * @typedef {Object} DesiredContent + * @typedef {Object} MessageContent * @public - * @property {string} type - Message content type. Valid types: from, date, subject, body, archive, file. - * @property {string?} [name=null] - A custom description for the content type. Optional. + * @property {string} [type] - Message content type. Valid types: from, date, subject, body, archive, file. 
+ * @property {string} [name] - A custom description for the content type. * @property {RegExp|string} [archive] - Identifier to isolate the desired attachment when type is 'archive'. * Use a regular expression for pattern matching or a string for a literal match. Required if type is 'archive'. * @property {RegExp|string} [file] - Identifier to isolate the desired attachment when type is 'file' or 'archive'. * Use a regular expression for pattern matching or a string for a literal match. Required if type is 'file' or 'archive'. - * @property {number?} [maxLength=null] - Maximum number of characters to retrieve from the content. Optional. + * @property {number?} [maxLength] - Maximum number of characters to retrieve from the content. */ /** * Configurable options provided to the Gmail adaptor. * @typedef {Object} Options * @public - * @property {string} userId - The email address of the Gmail account. - * @property {string?} [query=null] - Custom query to limit the messages result. Adheres to the Gmail search syntax. Optional. - * @property {Array?} [desiredContents=['from', 'date', 'subject', 'body']] - * An array of strings or DesiredContent objects used to specify which parts of the message to retrieve. Optional, default is `['from', 'date', 'subject', 'body']`. - * @property {Array?} [processedIds=null] - Ignore message ids which have already been processed. Optional. + * @property {string?} [query] - Gmail search query string. + * @property {Array} [contents=['from', 'date', 'subject']] + * An array of strings or MessageContent objects used to specify which parts of the message to retrieve. + * @property {Array} [processedIds] - Ignore message ids which have already been processed. + * @property {string?} [email] - The user account to retrieve messages from. Defaults to the authenticated user. */ /** - * Requests contents from messages of a Gmail account. + * Downloads contents from messages of a Gmail account. 
* @public * @function - * @param {string} userId - The email address of the account to retrieve messages from. - * @param {Options} userOptions - Customized options including desired contents and query. - * @returns {Function} A function that processes the state. - * @example + * @param {Options} options - Customized options including desired contents and query. + * @state {Array} data - The returned message objects, of the form `{ messageId, from, date, subject, ... }`, with one property per requested content item + * @state {Array} processedIds - An array of string ids processed by this request + * @returns {Operation} + * @example Get a message with a specific subject + * getContentsFromMessages( + * { + * query: 'subject:my+test+message' + * } + * ) + * @example Get messages after a specific date, with subject and report.txt attachment * getContentsFromMessages( - * 'test@tester.com', * { - * query: 'in:inbox subject:my+test+message', - * desiredContents: ['date', 'from', 'subject', { type: 'body', maxLength: 50 }] + * query: 'after:2025/01/15', + * contents: [ + * 'subject', + * { type: 'file', name: 'metadata', file: 'report.txt'} + * ] * } * ) */ -export function getContentsFromMessages(userId, userOptions) { +export function getContentsFromMessages(options) { return async state => { - const [resolvedUserId, resolvedUserOptions] = expandReferences( - state, - userId, - userOptions - ); + const [resolvedOptions] = expandReferences(state, options); const defaultOptions = { - desiredContents: ['from', 'date', 'subject', 'body'], + contents: ['from', 'date', 'subject'], + userId: 'me', }; - const options = { ...defaultOptions, ...(resolvedUserOptions || {}) }; + const opts = { + userId: resolvedOptions.email ?? 
defaultOptions.userId, + query: resolvedOptions.query, + processedIds: resolvedOptions.processedIds, + }; + + const contentIndicators = getContentIndicators( + defaultOptions.contents, + resolvedOptions.contents, + ); const contents = []; const currentIds = []; - const previousIds = Array.isArray(options.processedIds) - ? options.processedIds + const previousIds = Array.isArray(opts.processedIds) + ? opts.processedIds : []; let nextPageToken = null; do { const messagesResult = await getMessagesResult( - resolvedUserId, - options.query, + opts.userId, + opts.query, nextPageToken ); @@ -104,23 +119,21 @@ export function getContentsFromMessages(userId, userOptions) { messageId: messageId, }; - const messageResult = await getMessageResult(userId, messageId); - - for (const desiredContentHint of options.desiredContents) { - const desiredContent = getDesiredContent(desiredContentHint); + const messageResult = await getMessageResult(opts.userId, messageId); + for (const contentIndicator of contentIndicators) { const messageContent = await getMessageContent( messageResult, - desiredContent + contentIndicator ); - if (messageContent && content[desiredContent.name]) { + if (messageContent && content[contentIndicator.name]) { throw new Error( - `Duplicate content name detected: ${desiredContent.name}` + `Duplicate content name detected: ${contentIndicator.name}` ); } - content[desiredContent.name] ??= messageContent; + content[contentIndicator.name] ??= messageContent; } contents.push(content); diff --git a/packages/gmail/src/Utils.js b/packages/gmail/src/Utils.js index 94f18c6bf..276b7fcf0 100644 --- a/packages/gmail/src/Utils.js +++ b/packages/gmail/src/Utils.js @@ -3,13 +3,13 @@ import { google } from 'googleapis'; let gmail; -export async function getMessagesResult(userId, query, lastPageToken) { +export async function getMessagesResult(userId, query, pageToken) { try { const { data } = await gmail.users.messages.list({ - userId: userId, + userId, q: query, - maxResults: 
3, - pageToken: lastPageToken, + maxResults: 20, + pageToken, }); return { @@ -36,27 +36,48 @@ export async function getMessageResult(userId, messageId) { }; } -export function getDesiredContent(hint) { - const desiredContent = typeof hint === 'string' ? { type: hint } : hint; +export function getContentIndicators(defaultContentRequests, contentRequests) { + const indicators = new Map(); - if (!desiredContent.type) { - if (desiredContent.archive) { - desiredContent.type = 'archive'; - } else if (desiredContent.file) { - desiredContent.type = 'file'; + const requests = [ + ...(defaultContentRequests || []), + ...(contentRequests || []), + ]; + + for (const request of requests) { + const indicator = getContentIndicator(request); + indicators.set(indicator.type, indicator); + } + + return Array.from(indicators.values()); +} + +function getContentIndicator(contentRequest) { + const contentIndicator = + typeof contentRequest === 'string' + ? { type: contentRequest } + : { ...contentRequest }; + + if (!contentIndicator.type) { + if (contentIndicator.archive) { + contentIndicator.type = 'archive'; + } else if (contentIndicator.file) { + contentIndicator.type = 'file'; } } - if (!desiredContent.type) { - console.error(`Unable to determine desired content type: ${hint}`); + if (!contentIndicator.type) { + console.error( + `Unable to determine desired content type: ${contentRequest}` + ); throw new Error('No desired content type provided.'); } - if (!desiredContent.name) { - desiredContent.name = desiredContent.type; + if (!contentIndicator.name) { + contentIndicator.name = contentIndicator.type; } - return desiredContent; + return contentIndicator; } export async function getMessageContent(message, desiredContent) { @@ -70,9 +91,9 @@ export async function getMessageContent(message, desiredContent) { case 'body': return getBodyFromMessage(message, desiredContent); - case 'subject': case 'from': case 'date': + case 'subject': return getValueFromMessageHeader(message, 
desiredContent); default: