// fetch.scala
import ammonite.ops._
import cats.implicits._
import io.chrisdavenport.cats.time._
import org.jsoup._
import org.rogach.scallop._
import io.circe._
import io.circe.parser._
import scala.collection.JavaConverters._
import scala.util.chaining._
import scala.util.control.Exception._
import scala.util.Try
import sys.process._
import wvlet.log.LogSupport
import java.nio.file.{Paths, Files}
import java.nio.charset.StandardCharsets
import java.time._
import java.time.format._
import java.io.File
object nytimes extends LogSupport {
/** Side-effecting helpers: writing files and invoking external command-line tools. */
object io extends LogSupport {
  /**
   * Writes `txt` to the file at `path`, encoded as UTF-8.
   *
   * @param path destination file path
   * @param txt  text to write
   */
  def write(path: String, txt: String): Unit = {
    Files.write(Paths.get(path), txt.getBytes(StandardCharsets.UTF_8))
  }

  /**
   * Converts the briefing's HTML file inside `targetDirectory` to a PDF by
   * running the external `weasyprint` command.
   *
   * @param targetDirectory directory holding the HTML input and receiving the PDF
   * @param briefing        briefing whose `htmlFilename` / `pdfFilename` are used
   */
  def toPDF(targetDirectory: Path, briefing: Briefing): Unit = {
    val inputFile = targetDirectory/briefing.htmlFilename
    val outputFile = targetDirectory/briefing.pdfFilename
    run(Seq("weasyprint", inputFile.toString, outputFile.toString))
  }

  /**
   * Uploads the briefing's PDF from `targetDirectory` by running the external
   * `rmapi put` command.
   *
   * @param targetDirectory directory holding the PDF
   * @param briefing        briefing whose `pdfFilename` is uploaded
   */
  def uploadToRemarkable(targetDirectory: Path, briefing: Briefing): Unit = {
    val targetFile = targetDirectory/briefing.pdfFilename
    run(Seq("rmapi", "put", targetFile.toString))
  }

  /**
   * Runs `command` with its output discarded and logs a warning on a non-zero exit code.
   *
   * The command is passed as an argument sequence rather than a single string:
   * a string command is split on whitespace, which breaks paths containing spaces.
   */
  private def run(command: Seq[String]): Unit = {
    val exitCode = scala.sys.process.Process(command) ! ProcessLogger(_ => ())
    if (exitCode != 0) {
      warn(s"Command exited with code $exitCode: ${command.mkString(" ")}")
    }
  }
}
// Site root; note the trailing slash, so paths appended to it must not start with "/".
val baseUrl: String = "https://www.nytimes.com/"
// Path, relative to baseUrl, of the page that is scanned for briefing links.
val usBriefingUrl: String = "spotlight/us-briefing"
/**
 * A briefing article discovered on the site.
 *
 * @param date         publication date of the briefing
 * @param url          link target (href) of the briefing
 * @param name         human-readable title
 * @param htmlFilename file name under which the HTML copy is stored
 */
case class Briefing(date: LocalDate, url: String, name: String, htmlFilename: String) {
  /** File name of the PDF rendition: `htmlFilename` with every ".html" replaced by ".pdf". */
  def pdfFilename: String = htmlFilename.replace(".html", ".pdf")
}
/** Fetching and parsing helpers for [[Briefing]]. */
object Briefing {
  /** Downloads the page at `baseUrl` + `usBriefingUrl`. */
  def fetchUSSpotlight: nodes.Document = Jsoup.connect(s"$baseUrl$usBriefingUrl").get()

  /**
   * Downloads the page for the given briefing.
   *
   * `baseUrl` already ends with "/", so a leading "/" on the briefing's url
   * is removed first to avoid requesting a path that starts with "//".
   */
  def fetch(briefing: Briefing): nodes.Document = {
    val path = briefing.url.stripPrefix("/")
    Jsoup.connect(s"$baseUrl$path").get()
  }

  /** Turns a file name such as "some-news.html" into a title such as "Some News". */
  def urlToName(s: String): String =
    s.replace(".html", "").split("-").map(_.capitalize).mkString(" ")

  /**
   * Builds a [[Briefing]] from a link element.
   *
   * The first three path segments of the element's href are parsed as a
   * `yyyy/MM/dd` date and the last segment is used as the HTML file name.
   *
   * @return `None` when the href does not start with a parsable date
   */
  def from(node: nodes.Element): Option[Briefing] = {
    val url = node.attr("href")
    val dateFormat = DateTimeFormatter.ofPattern("yyyy/MM/dd")
    val dateString = url.stripPrefix("/").split('/').take(3).mkString("/")
    for {
      date <- Try(LocalDate.parse(dateString, dateFormat)).toOption
      htmlFilename <- url.split('/').lastOption
    } yield new Briefing(date, url, urlToName(htmlFilename), htmlFilename)
  }

  /**
   * Extracts all briefings linked from `doc`: anchors whose href contains
   * "/briefing/" but not "signup.html", and for which [[from]] succeeds.
   */
  def toDoc(doc: nodes.Document): List[Briefing] =
    doc.select("a").asScala
      .filter(_.attr("href").contains("/briefing/"))
      .filterNot(_.attr("href").contains("signup.html"))
      .flatMap(Briefing.from)
      .toList
}
// Marker that starts the inline script carrying the page's preloaded JSON state.
val prefix = "window.__preloadedData = "

/**
 * Finds the first `<script>` whose content starts with `prefix` and returns
 * the payload that follows it.
 *
 * Only the leading marker is removed (occurrences inside the payload are kept),
 * and a trailing ";" is removed only if present, after trimming surrounding
 * whitespace, so a payload without the terminator is not truncated.
 *
 * @return the payload text, or `None` when no such script exists
 */
def briefingsData(doc: nodes.Document): Option[String] =
  doc.select("script").asScala
    .map(_.data())
    .find(_.startsWith(prefix))
    .map(_.stripPrefix(prefix).trim.stripSuffix(";"))
/**
 * Tests whether the field `key` under the cursor decodes to a string equal to `value`.
 * A missing field or a non-string value yields `false`.
 */
def keyHasValue(key: String, value: String)(hcursor: ACursor): Boolean =
  hcursor.get[String](key).exists(_ == value)
// TODO: this code is nasty, but it was the first thing I got working. :D
/**
 * Collects image URLs from the preloaded page-state JSON.
 *
 * Every lookup goes through the top-level "initialState" object, whose
 * fields are addressed by key. Starting from the image-block entries, the
 * chain follows "id" references from block -> media -> crops -> renditions
 * and finally reads the "url" of each rendition whose id contains "superJumbo".
 *
 * @param json parsed page-state JSON
 * @return the rendition URLs found; empty when "initialState" exposes no keys
 */
def getImages(json: Json): List[String] = {
// Cursor on "initialState"; reused below to resolve ids to entries.
val hcursor = json.hcursor.downField("initialState")
hcursor.keys
// `keys` is empty (None) when there is nothing to iterate => no images.
.fold(List[String]())(
// Ignore entries whose key ends with "ledeMedia".
_.filterNot(_.endsWith("ledeMedia"))
.map(hcursor.downField)
// Keep entries whose "__typename" is "ImageBlock" ...
.filter(keyHasValue("__typename", "ImageBlock"))
// ... and step into their "media" field.
.map(_.downField("media"))
// Note: this check reads "typename" (no leading underscores), unlike the one above.
.filter(keyHasValue("typename", "Image"))
// The media's "id" is itself a key under "initialState"; resolve it.
.flatMap(_.get[String]("id").toOption)
.map(k => hcursor.downField(k))
.flatMap(lens => {
lens.keys.map(
// For each field of the image entry whose name starts with "crops",
// take its elements and read each element's "id".
_.filter(_.startsWith("crops"))
.map(lens.downField)
.flatMap(_.values)
.flatten
.flatMap(_.hcursor.get[String]("id").toOption)
)
})
.flatten
// Resolve each crop id under "initialState".
.map(hcursor.downField)
//.flatMap(_.focus)
// Take the elements of each crop's "renditions" field.
.flatMap(_.downField("renditions").values)
.flatten
// TODO: For some reason the following line fails
// if I use .get[String]("id").toOption!?!
.flatMap(_.hcursor.downField("id").as[String].toOption)
// Only rendition ids containing "superJumbo" are used.
.filter(_.contains("superJumbo"))
// Resolve the rendition id under "initialState" and read its "url".
.map(hcursor.downField)
.flatMap(_.get[String]("url").toOption)
.toList
)
}
/**
 * Fills the document's lazy-image placeholders with real `<picture><img>` elements.
 *
 * Placeholders are the `figure div[data-testid="lazyimage-container"]` elements;
 * image URLs come from [[getImages]]. When there are more URLs than placeholders,
 * the first URL is skipped. URLs and placeholders are then paired in order.
 *
 * @param doc  document to modify in place
 * @param json parsed page-state JSON
 * @return the same `doc` instance
 */
def insertLazyImages(doc: nodes.Document)(json: Json): nodes.Document = {
  val containers = doc.select("""figure div[data-testid="lazyimage-container"]""").asScala.toList
  val found = getImages(json)
  val images = if (found.length > containers.length) found.drop(1) else found

  images.zip(containers).foreach { case (url, container) =>
    container.attr("style", "height: auto")
    val picture = container.appendElement("picture")
    picture.attr("style", "opacity: 1; display: block; width: 100%")
    picture.appendElement("img")
      .attr("class", "css-doesntmattr")
      .attr("src", s"${url}?quality=75&auto=webp&disable=upscale")
      .attr("decoding", "async")
      .attr("style", "width:100%;vertical-align:top; height: auto; max-width: 100%")
  }
  doc
}
/**
 * Downloads a briefing page and, when its preloaded JSON can be extracted and
 * parsed, inserts the images referenced there into the document.
 *
 * If the JSON is missing or cannot be parsed, an error is logged and the
 * downloaded document is returned unchanged.
 *
 * @param targetDirectory not used by this method
 * @param briefing        briefing to download
 * @return the downloaded (and possibly image-enriched) document
 */
def downloadAndProcessBriefing(targetDirectory: Path, briefing: Briefing): nodes.Document = {
  val doc = Briefing.fetch(briefing)
  briefingsData(doc) match {
    case None =>
      val err = "No Json to Parse"
      error(s"Unable to parse JSON: $err")
      doc
    case Some(raw) =>
      parse(raw) match {
        case Left(err) =>
          error(s"Unable to parse JSON: $err")
          doc
        case Right(json) =>
          insertLazyImages(doc)(json)
      }
  }
}
}
object fetch extends LogSupport {
/** Stats `p`, returning `None` if the stat call throws (e.g. the path does not exist). */
def statFile(p: Path) = Try{stat! p}.toOption

/** True when `p` can be stat'ed and is a regular file. */
def isFile(p: Path) = statFile(p).map(_.isFile) getOrElse false

/**
 * True when `p` can be stat'ed and is a directory.
 * (Previously this checked `isFile`, so it was false for every real directory.)
 */
def isDirectory(p: Path) = statFile(p).map(_.isDir) getOrElse false
/**
 * Command-line configuration.
 *
 * @param arguments raw command-line arguments
 */
class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
  /** Required option: directory in which the briefing files are stored. */
  val targetDirectory: ScallopOption[String] = opt[String](required = true)

  // Must run after all options are declared.
  verify()
}
/**
 * Entry point: downloads today's briefings that do not yet have a PDF in the
 * target directory, writes each as HTML, converts it to PDF and uploads it.
 *
 * The three stages run as separate passes over the pending briefings:
 * all downloads first, then all PDF conversions, then all uploads.
 */
def main(args: Array[String]): Unit = {
  val conf = new Conf(args)
  val targetDirectory = Path(conf.targetDirectory())
  if (!isDirectory(targetDirectory)) {
    mkdir! targetDirectory
  }

  info("Downloading briefings")
  val today = LocalDate.now
  val spotlight = nytimes.Briefing.fetchUSSpotlight
  val todays = nytimes.Briefing.toDoc(spotlight).filter(_.date == today)
  info(s"Found ${todays.length} briefings")

  // Skip briefings whose PDF already exists in the target directory.
  val pending = todays.filterNot(briefing => isFile(targetDirectory/briefing.pdfFilename))
  info(s"Updating ${pending.length} briefings")

  pending.foreach { briefing =>
    info(s"Processing ${briefing.htmlFilename}")
    val doc = nytimes.downloadAndProcessBriefing(targetDirectory, briefing)
    nytimes.io.write((targetDirectory/briefing.htmlFilename).toString, doc.toString())
  }
  pending.foreach { briefing =>
    info(s"Turning into pdf ${briefing.pdfFilename}")
    nytimes.io.toPDF(targetDirectory, briefing)
  }
  pending.foreach { briefing =>
    info(s"Uploading to remarkable ${briefing.pdfFilename}")
    nytimes.io.uploadToRemarkable(targetDirectory, briefing)
  }
}
}