kazunori279 · 0xrushi · May 9, 2021 · May 9, 2021 · May 10, 2021 · May 15, 2021
diff --git a/README.md b/README.md
@@ -1,22 +1,26 @@
-# pdf2audiobook
+# PDF2Audiobook
 
-See this [video](https://www.youtube.com/watch?v=_JVRew5zXBQ) for learning how the tool works.
+Check out my [blog](https://konfido.github.io/Convert-PDFs-to-Audiobooks-with-Machine-Learning/) for a step-by-step guide and an explanation of how this tool works.
 
-## pdf2audiobook training process
+See also the original articles ([Kazunori](https://cloud.google.com/blog/ja/products/ai-machine-learning/practical-machine-learning-with-automl-series-3), [Markowitz](https://daleonai.com/pdf-to-audiobook)) and videos ([Kazunori](https://www.youtube.com/watch?v=_JVRew5zXBQ), [Markowitz](https://www.youtube.com/watch?v=q-nvbuc59Po)) for reference.
 
-1. Create a training data as CSV file
-2. Train a AutoML Tables model
-3. use pdf2audiobook for generating mp3 files
 
-## pdf2audiobook usage
 
-Register the code with Cloud Functions the following command. You need to create the bucket beforehand as a workspace for pdf2audiobook.
+## Main process
 
-`gcloud functions deploy p2a_gcs_trigger --runtime python37 --trigger-bucket <bucket> --memory=2048MB --timeout=540`
+1. Set `ANNOTATION_MODE = True` (/functions) for generating annotation data
+2. By default, [pdfminer.six](https://github.com/pdfminer/pdfminer.six) is used to extract the content of PDF files; set `NO_OCR=False` to use Google OCR instead.
+3. Use annotation tool (/apps-script) to create training data
+4. Train an AutoML Tables model
+5. Set `ANNOTATION_MODE = False` for generating mp3 files
 
-## Annotation
 
-- Annotation mode usage: to use pdf2audiobook for generating annotation data, set `ANNOTATION_MODE = True` and re-register the code with Cloud Funtions, so the tool will generate CSV files for annotation instead of mp3 files.
 
-- Annotation tool: use /apps-script code for running the annotation tool with Google Apps Script
+## Deploy with command line
+
+You can operate entirely in the web editors, but if you prefer registering and deploying from the command line, use the following command. You need to create the bucket beforehand as a workspace for PDF2Audiobook.
+
+`gcloud functions deploy <FUNCTION_NAME> --runtime python37 --trigger-bucket <BUCKET> --memory=2048MB --timeout=540`
+
+
 
diff --git a/apps-script/auth.gs b/apps-script/auth.gs
@@ -0,0 +1,95 @@
+
+
+/**
+ * doGet, entrance of this App
+*/
+function doGet(e) {
+  Logger.log(e.parameter);
+  var storageService = getService();
+  if (storageService.hasAccess()) {
+    var html = HtmlService.createTemplateFromFile('index');
+    return html.evaluate();
+  } else {
+    // Show the clickable authorization url
+    var authorizationUrl = storageService.getAuthorizationUrl();
+    var template = HtmlService.createTemplate(
+      '<a>Click the link ---> </a>'+
+      '<a href="<?= authorizationUrl ?>"target="_blank">Authorize</a>'+
+      '<p> Refresh this page after you complete the authorization.</p>'
+    );
+    template.authorizationUrl = authorizationUrl;
+    Logger.log('Open the following URL and re-run the script: %s', authorizationUrl);
+    return template.evaluate();
+  }
+}
+
+/**
+ * Make a request to Google Storage API.
+ */
+function authFetch(url, objectSource) {
+  if ( typeof objectSource == "undefined" ) {
+    // normal GET
+    return UrlFetchApp.fetch(url, {
+      method: "GET",
+      headers: {
+        Authorization: 'Bearer '+ storageService.getAccessToken(),
+      },
+      'muteHttpExceptions': true,
+      });
+  } else {
+    // copy objectSource to url
+    return UrlFetchApp.fetch(url, {
+      method: "PUT",
+      headers: {
+        Authorization: 'Bearer '+ storageService.getAccessToken(),
+        "x-goog-copy-source": objectSource,
+      },
+      'muteHttpExceptions': true,
+      });
+  }
+}
+
+/**
+ * Reset the authorization state, so that it can be re-tested.
+ */
+function reset() {
+  getService().reset();
+}
+
+/**
+ * Configures the service.
+ */
+function getService() {
+  return OAuth2.createService('storage')
+    .setAuthorizationBaseUrl('https://accounts.google.com/o/oauth2/auth')
+    .setTokenUrl('https://accounts.google.com/o/oauth2/token')
+    .setClientId(CLIENT_ID)
+    .setClientSecret(CLIENT_SECRET)
+    .setCallbackFunction('authCallback')
+    .setPropertyStore(PropertiesService.getUserProperties())
+    .setScope('https://www.googleapis.com/auth/devstorage.read_write')
+    .setParam('access_type', 'offline')
+    .setParam('approval_prompt', 'force')
+    // .setParam('prompt', 'consent')
+    .setParam('login_hint', Session.getActiveUser().getEmail());
+}
+
+/**
+ * Handles the OAuth callback.
+ */
+function authCallback(request) {
+  var service = getService();
+  var authorized = service.handleCallback(request);
+  if (authorized) {
+    return HtmlService.createHtmlOutput('Success!');
+  } else {
+    return HtmlService.createHtmlOutput('Denied.');
+  }
+}
+
+/**
+ * Logs the redict URI to register in the Google Developers Console.
+ */
+function logRedirectUri() {
+  Logger.log(OAuth2.getRedirectUri());
+}
diff --git a/apps-script/do_get.gs b/apps-script/do_get.gs
@@ -21,6 +21,10 @@
 var PDF_NAME = '<YOUR PDF FILE NAME>';
 var BUCKET_NAME = '<YOUR BUCKET NAME>';
 var SHEET_ID = '<YOUR SHEET ID>';
+// var TOKEN = "<YOUR OAUTH2 TOKEN>";
+
+var CLIENT_ID = '<YOUR-CLIENT-ID>';
+var CLIENT_SECRET = '<YOUR-CLIENT-SECRET>';
 
 //
 // init
@@ -43,15 +47,6 @@ function getHeaderList() {
   return sheet.getRange(1, 1, 1, sheet.getLastColumn()).getValues()[0];
 }
 
-//
-// doGet
-//
-
-function doGet(e) {
-  var html = HtmlService.createTemplateFromFile('index');
-  return html.evaluate();
-}
-
 //
 // RPCs
 //
@@ -62,29 +57,47 @@ function getImageUrl() {
   return imageUrl;
 }
 
+
 // download labels CSVs from GCS and create a Sheet
 function downloadLabels() {
 
   // download all CSV files from GCS
-  var csv = '';  
-  for (var i = 1; true; i += 100) {    
-    
+  var csv = '';
+  for (var i = 1; true; i += 100) {
+
     // build CSV URL like 'https://storage.googleapis.com/foo-bucket/foo-001-labels.csv'
     var batchId = pdfId + '-' + ('000' + i).slice(-3);
-    var url = bucketUrl + batchId + '-labels.csv';
-
-    // download the csv
-    var resp = UrlFetchApp.fetch(url, {'muteHttpExceptions': true});
-    if (resp.getResponseCode() == 200) {    
-      csv += resp.getContentText('UTF-8');
+    var urlFeature = bucketUrl + batchId + '-features.csv'
+    var urlLabel = bucketUrl + batchId + '-labels.csv';
+    var respFeature = authFetch(urlFeature)
+    var respLabel = authFetch(urlLabel)
+
+    if (respLabel.getResponseCode() == 200 ) {
+      // Already predicted a label file
+      csv += respLabel.getContentText('UTF-8');
+    } else if (respFeature.getResponseCode() == 200) {
+      // Exist feature file, forge a '...-labels.csv' file
+      csv += respFeature.getContentText('UTF-8')
     } else {
+      Logger.log(respLabel.getContentText());
+      Logger.log(respFeature.getContentText());
       break;
     }
   }
-  
+
   // parse CSV
   var labelData = Utilities.parseCsv(csv);
 
+  // Add a 'label' column if it not exist
+  if (!labelData[0].includes("label")) {
+    // Set "label" to "body" if "text".length>100
+    labelData.map(x => {
+      x.push((x[1].length>100)?"body":"other");
+      return x;
+    });
+    labelData[0][labelData[0].length-1] = "label"
+  }
+
   // rename old sheet if needed
   var oldSheet = getSheet();
   if (oldSheet) {
@@ -112,7 +125,7 @@ function updateLabel(id, label) {
 
 // returns paraDict as JSON encoded string
 function getParaDict() {
-  
+
   // check if the sheet is available
   if (getSheet() == null) {
     Logger.log('The sheet for ' + pdfId + ' is not available');
@@ -121,31 +134,31 @@ function getParaDict() {
 
   // build paraDict
   var paraDict = buildParaDictFromSheet();
-  
+
   // return as JSON string
   Logger.log('paraDict returned.');
   return JSON.stringify(paraDict);
 }
 
-function buildParaDictFromSheet() {  
-  
+function buildParaDictFromSheet() {
+
   // read rows from the sheet
   var paraDict = new Object();
   var sheet = getSheet();
   var headerList = getHeaderList();
   var vals = sheet.getRange(2, 1, sheet.getLastRow(), sheet.getLastColumn()).getValues();
-  
+
   // build page dict
   var pageCount = 0;
   var paraCount = 0;
   vals.forEach(function(row) {
-    
+
     // parse feature values
     var features = new Object();
     for (var i in row) {
       features[headerList[i]] = row[i];
     }
-    
+
     // parse id
     var m = features.id.match(/(.*)-([0-9]+)-([0-9]+)/);
     if (!m) return;
@@ -159,16 +172,15 @@ function buildParaDictFromSheet() {
       pageCount++;
     }
     paraDict[page].push(features);
-    
+
     // sort by the area size of para
     paraDict[page].sort(function (a, b) {
       return a.area < b.area ? 1 : (a.area == b.area ? 0 : -1);
     });
     paraCount++;
   });
-  
+
   Logger.log('Built paraDict with ' + pageCount + ' pages, ' + paraCount + ' paragraphs.');
   return paraDict
 
 }
-