Speedrunning duplicate code

dpla · Jan 2, 2025 · 7f024b3 · 7f024b3
1 parent c133be6
commit 7f024b3
Show file tree

Hide file tree

Showing 17 changed files with 96 additions and 343 deletions.
diff --git a/src/main/scala/dpla/ingestion3/harvesters/Harvester.scala b/src/main/scala/dpla/ingestion3/harvesters/Harvester.scala
@@ -2,6 +2,7 @@ package dpla.ingestion3.harvesters
 
 import java.io.File
 import dpla.ingestion3.confs.i3Conf
+import dpla.ingestion3.harvesters.file.ParsedResult
 import dpla.ingestion3.utils.{FlatFileIO, Utils}
 import org.apache.avro.Schema
 import org.apache.avro.file.DataFileWriter
@@ -10,6 +11,7 @@ import org.apache.commons.io.FileUtils
 import org.apache.logging.log4j.LogManager
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{DataFrame, SparkSession}
+import scala.xml._
 
 abstract class Harvester(
     spark: SparkSession,
@@ -72,6 +74,17 @@ abstract class LocalHarvester(
 
   def getAvroWriter: DataFileWriter[GenericRecord] = avroWriter
 
+  def writeOut(unixEpoch: Long, item: ParsedResult): Unit = { // appends one Avro record for `item`
+    val avroWriter = getAvroWriter // local alias; shadows the class member of the same name
+    val genericRecord = new GenericData.Record(Harvester.schema) // empty record on the shared schema
+    genericRecord.put("id", item.id)
+    genericRecord.put("ingestDate", unixEpoch) // caller-supplied value, stored as given
+    genericRecord.put("provider", shortName) // shortName is not a parameter; resolved from the class
+    genericRecord.put("document", item.item) // the record body carried by ParsedResult
+    genericRecord.put("mimetype", mimeType) // mimeType is likewise resolved from the class
+    avroWriter.append(genericRecord) // append only; no flush here (cleanUp() flushes and closes)
+  }
+
   override def cleanUp(): Unit = {
     avroWriter.flush()
     avroWriter.close()
@@ -82,6 +95,18 @@ abstract class LocalHarvester(
 
 object Harvester {
 
+
+  /** Serializes a scala.xml Node, together with its subtree, to an XML string
+   *
+   * @param node
+   *   The root of the tree to write to a string
+   * @return
+   *   The serialized XML as a String, written with MinimizeMode.Always
+   */
+  def xmlToString(node: Node): String =
+    Utility.serialize(node, minimizeTags = MinimizeMode.Always).toString
+
+
   // Schema for harvested records.
   val schema: Schema =
     new Schema.Parser()

diff --git a/src/main/scala/dpla/ingestion3/harvesters/file/CommunityWebsHarvester.scala b/src/main/scala/dpla/ingestion3/harvesters/file/CommunityWebsHarvester.scala
@@ -28,20 +28,6 @@ class CommunityWebsHarvester(
 
   protected val extractor = new FlFileExtractor()
 
-  /** Loads .zip files
-    *
-    * @param file
-    *   File to parse
-    * @return
-    *   ZipInputstream of the zip contents
-    */
-  def getInputStream(file: File): Option[ZipInputStream] =
-    file.getName match {
-      case zipName if zipName.endsWith("zip") =>
-        Some(new ZipInputStream(new FileInputStream(file)))
-      case _ => None
-    }
-
   /** Parses JValue to extract item local item id and renders compact full
     * record
     *
@@ -114,7 +100,7 @@ class CommunityWebsHarvester(
     inFiles
       .listFiles(zipFilter)
       .foreach(inFile => {
-        val inputStream: ZipInputStream = getInputStream(inFile)
+        val inputStream: ZipInputStream = FileHarvester.getZipInputStream(inFile)
           .getOrElse(
             throw new IllegalArgumentException("Couldn't load ZIP files.")
           )

diff --git a/src/main/scala/dpla/ingestion3/harvesters/file/DlgFileHarvester.scala b/src/main/scala/dpla/ingestion3/harvesters/file/DlgFileHarvester.scala
@@ -33,19 +33,6 @@ class DlgFileHarvester(
 
   protected val extractor = new DlgFileExtractor()
 
-  /** Loads .zip files
-    *
-    * @param file
-    *   File to parse
-    * @return
-    *   ZipInputstream of the zip contents
-    */
-  def getInputStream(file: File): Option[ZipInputStream] =
-    file.getName match {
-      case zipName if zipName.endsWith("zip") =>
-        Some(new ZipInputStream(new FileInputStream(file)))
-      case _ => None
-    }
 
   /** Parses JValue to extract item local item id and renders compact full
     * record
@@ -118,7 +105,7 @@ class DlgFileHarvester(
     inFiles
       .listFiles(zipFilter)
       .foreach(inFile => {
-        val inputStream: ZipInputStream = getInputStream(inFile)
+        val inputStream: ZipInputStream = FileHarvester.getZipInputStream(inFile)
           .getOrElse(
             throw new IllegalArgumentException("Couldn't load ZIP files.")
           )

diff --git a/src/main/scala/dpla/ingestion3/harvesters/file/DplaJsonlFileHarvester.scala b/src/main/scala/dpla/ingestion3/harvesters/file/DplaJsonlFileHarvester.scala
@@ -32,21 +32,6 @@ class DplaJsonlFileHarvester(
 
   protected val extractor = new DplaJsonlFileExtractor()
 
-  /** Loads .zip files containing DPLA JSONL
-    *
-    * @param file
-    *   File to parse
-    * @return
-    *   FileInputStream of the file contents
-    */
-  def getInputStream(file: File): Option[ZipInputStream] = {
-    file.getName match {
-      case zipName if zipName.endsWith("zip") =>
-        Some(new ZipInputStream(new FileInputStream(file)))
-      case _ => None
-    }
-  }
-
   /** Parses JValue to extract item local item id and renders compact full
     * record
     *
@@ -122,7 +107,7 @@ class DplaJsonlFileHarvester(
     inFiles
       .listFiles(zipFilter)
       .foreach(inFile => {
-        val inputStream: ZipInputStream = getInputStream(inFile)
+        val inputStream: ZipInputStream = FileHarvester.getZipInputStream(inFile)
           .getOrElse(
             throw new IllegalArgumentException("Couldn't load ZIP files.")
           )

diff --git a/src/main/scala/dpla/ingestion3/harvesters/file/FileHarvester.scala b/src/main/scala/dpla/ingestion3/harvesters/file/FileHarvester.scala
@@ -1,6 +1,6 @@
 package dpla.ingestion3.harvesters.file
 
-import java.io.{BufferedReader, InputStreamReader}
+import java.io.{BufferedReader, File, FileInputStream, InputStreamReader}
 import dpla.ingestion3.confs.i3Conf
 import dpla.ingestion3.harvesters.{Harvester, LocalHarvester}
 import org.apache.avro.generic.GenericData
@@ -9,8 +9,9 @@ import org.apache.log4j.Logger
 import org.apache.spark.sql.SparkSession
 import org.apache.tools.tar.TarInputStream
 
-import java.util.zip.ZipInputStream
+import java.util.zip.{GZIPInputStream, ZipInputStream}
 import scala.util.Try
+import scala.xml._
 
 /** File based harvester
   *
@@ -46,19 +47,10 @@ abstract class FileHarvester(
     *   Harvested record
     */
 
-  def writeOut(unixEpoch: Long, item: ParsedResult): Unit = {
-    val avroWriter = getAvroWriter
-    val genericRecord = new GenericData.Record(Harvester.schema)
-    genericRecord.put("id", item.id)
-    genericRecord.put("ingestDate", unixEpoch)
-    genericRecord.put("provider", shortName)
-    genericRecord.put("document", item.item)
-    genericRecord.put("mimetype", mimeType)
-    avroWriter.append(genericRecord)
-  }
 
   def flush(): Unit = getAvroWriter.flush()
 
+
 }
 
 /** Case class to hold the results of a file
@@ -83,6 +75,33 @@ case class FileResult(
 case class ParsedResult(id: String, item: String)
 
 object FileHarvester {
+
+  def getZipInputStream(file: File): Option[ZipInputStream] = // None unless the name ends with "zip"
+    file.getName match {
+      case zipName if zipName.endsWith("zip") => // bare suffix test; a leading "." is not required
+        Some(new ZipInputStream(new FileInputStream(file))) // returned open; not closed in here
+      case _ => None
+    }
+
+  /** Opens a .tar.gz (gunzipped on the fly) or a plain .tar file as a tar stream
+   *
+   * @param file
+   *   File to open; dispatch is on whether its name ends with "gz" or "tar"
+   * @return
+   *   Some(TarInputStream) over the archive, or None for any other file name
+   */
+  def getTarInputStream(file: File): Option[TarInputStream] = {
+    file.getName match {
+      case zipName if zipName.endsWith("gz") =>
+        Some(new TarInputStream(new GZIPInputStream(new FileInputStream(file))))
+      case zipName if zipName.endsWith("tar") =>
+        Some(new TarInputStream(new FileInputStream(file)))
+
+      case _ => None
+    }
+  }
+
+
   def iter(zipInputStream: ZipInputStream): LazyList[FileResult] =
     Option(zipInputStream.getNextEntry) match {
       case None =>

diff --git a/src/main/scala/dpla/ingestion3/harvesters/file/FlFileHarvester.scala b/src/main/scala/dpla/ingestion3/harvesters/file/FlFileHarvester.scala
@@ -1,14 +1,14 @@
 package dpla.ingestion3.harvesters.file
 
-import java.io.{BufferedReader, File, FileInputStream, InputStreamReader}
+import java.io.{File, FileInputStream}
 import java.util.zip.ZipInputStream
 import dpla.ingestion3.confs.i3Conf
 import dpla.ingestion3.harvesters.file.FileFilters.zipFilter
 import dpla.ingestion3.mappers.utils.JsonExtractor
 import dpla.ingestion3.model.AVRO_MIME_JSON
 import org.apache.avro.generic.GenericData
 import org.apache.commons.io.IOUtils
-import org.apache.log4j.Logger
+
 import org.apache.logging.log4j.LogManager
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{DataFrame, SparkSession}
@@ -33,20 +33,6 @@ class FlFileHarvester(
 
   protected val extractor = new FlFileExtractor()
 
-  /** Loads .zip files
-    *
-    * @param file
-    *   File to parse
-    * @return
-    *   ZipInputstream of the zip contents
-    */
-  def getInputStream(file: File): Option[ZipInputStream] = {
-    file.getName match {
-      case zipName if zipName.endsWith("zip") =>
-        Some(new ZipInputStream(new FileInputStream(file)))
-      case _ => None
-    }
-  }
 
   /** Parses JValue to extract item local item id and renders compact full
     * record
@@ -121,7 +107,7 @@ class FlFileHarvester(
     inFiles
       .listFiles(zipFilter)
       .foreach(inFile => {
-        val inputStream: ZipInputStream = getInputStream(inFile)
+        val inputStream: ZipInputStream = FileHarvester.getZipInputStream(inFile)
           .getOrElse(
             throw new IllegalArgumentException("Couldn't load ZIP files.")
           )

diff --git a/src/main/scala/dpla/ingestion3/harvesters/file/HathiFileHarvester.scala b/src/main/scala/dpla/ingestion3/harvesters/file/HathiFileHarvester.scala
@@ -1,11 +1,10 @@
 package dpla.ingestion3.harvesters.file
 
-import java.io.{File, FileInputStream}
-import java.util.zip.GZIPInputStream
+import java.io.File
 import dpla.ingestion3.confs.i3Conf
+import dpla.ingestion3.harvesters.Harvester
 import org.apache.commons.io.IOUtils
 import org.apache.spark.sql.{DataFrame, SparkSession}
-import org.apache.tools.tar.TarInputStream
 import dpla.ingestion3.harvesters.file.FileFilters.gzFilter
 import dpla.ingestion3.mappers.utils.XmlExtractor
 import dpla.ingestion3.model.AVRO_MIME_XML
@@ -26,24 +25,6 @@ class HathiFileHarvester(
 
   def mimeType: GenericData.EnumSymbol = AVRO_MIME_XML
 
-  /** Loads .tar.gz files
-    *
-    * @param file
-    *   File to parse
-    * @return
-    *   Option[TarInputStream] of the zip contents
-    */
-  def getInputStream(file: File): Option[TarInputStream] = {
-    file.getName match {
-      case zipName if zipName.endsWith("gz") =>
-        Some(new TarInputStream(new GZIPInputStream(new FileInputStream(file))))
-      case zipName if zipName.endsWith("tar") =>
-        Some(new TarInputStream(new FileInputStream(file)))
-
-      case _ => None
-    }
-  }
-
   /** Takes care of parsing an xml file into a list of Nodes each representing
     * an item
     *
@@ -66,7 +47,7 @@ class HathiFileHarvester(
           .headOption
           .flatten
 
-        val outputXML = xmlToString(record)
+        val outputXML = Harvester.xmlToString(record)
 
         id match {
           case None =>
@@ -91,7 +72,7 @@ class HathiFileHarvester(
       .listFiles(gzFilter)
       .foreach(inFile => {
 
-        val inputStream = getInputStream(inFile)
+        val inputStream = FileHarvester.getTarInputStream(inFile)
           .getOrElse(
             throw new IllegalArgumentException(
               s"Couldn't load file, ${inFile.getAbsolutePath}"
@@ -120,16 +101,6 @@ class HathiFileHarvester(
     spark.read.format("avro").load(tmpOutStr)
   }
 
-  /** Converts a Node to an xml string
-    *
-    * @param node
-    *   The root of the tree to write to a string
-    * @return
-    *   a String containing xml
-    */
-  def xmlToString(node: Node): String =
-    Utility.serialize(node, minimizeTags = MinimizeMode.Always).toString
-
   /** Main logic for handling individual entries in the tar.
     *
     * @param tarResult

diff --git a/src/main/scala/dpla/ingestion3/harvesters/file/HeartlandFileHarvester.scala b/src/main/scala/dpla/ingestion3/harvesters/file/HeartlandFileHarvester.scala
@@ -32,20 +32,6 @@ class HeartlandFileHarvester(
 
   protected val extractor = new HeartlandFileExtractor()
 
-  /** Loads .zip files
-    *
-    * @param file
-    *   File to parse
-    * @return
-    *   ZipInputstream of the zip contents
-    */
-  def getInputStream(file: File): Option[ZipInputStream] = {
-    file.getName match {
-      case zipName if zipName.endsWith("zip") =>
-        Some(new ZipInputStream(new FileInputStream(file)))
-      case _ => None
-    }
-  }
 
   /** Parses JValue to extract item local item id and renders compact full
     * record
@@ -121,7 +107,7 @@ class HeartlandFileHarvester(
     inFiles
       .listFiles(zipFilter)
       .foreach(inFile => {
-        val inputStream: ZipInputStream = getInputStream(inFile)
+        val inputStream: ZipInputStream = FileHarvester.getZipInputStream(inFile)
           .getOrElse(
             throw new IllegalArgumentException("Couldn't load ZIP files.")
           )

diff --git a/src/main/scala/dpla/ingestion3/harvesters/file/NYPLFileHarvester.scala b/src/main/scala/dpla/ingestion3/harvesters/file/NYPLFileHarvester.scala
@@ -37,20 +37,6 @@ class NYPLFileHarvester(
 
   protected val extractor = new FlFileExtractor()
 
-  /** Loads .zip files
-    *
-    * @param file
-    *   File to parse
-    * @return
-    *   ZipInputstream of the zip contents
-    */
-  def getInputStream(file: File): Option[ZipInputStream] = {
-    file.getName match {
-      case zipName if zipName.endsWith("zip") =>
-        Some(new ZipInputStream(new FileInputStream(file)))
-      case _ => None
-    }
-  }
 
   /** @param json
     *   Full JSON item record
@@ -130,7 +116,7 @@ class NYPLFileHarvester(
     inFiles
       .listFiles(zipFilter)
       .foreach(inFile => {
-        val inputStream: ZipInputStream = getInputStream(inFile)
+        val inputStream: ZipInputStream = FileHarvester.getZipInputStream(inFile)
           .getOrElse(
             throw new IllegalArgumentException("Couldn't load ZIP files.")
           )