/*
 * @author Adit Shah
 *
 * This file contains the core logic for this project: finding and filtering links with a modified
 * breadth-first search, and ranking the resulting pages with a heap (priority queue).
 */
import kotlinx.serialization.UnstableDefault
import org.jsoup.Jsoup
import java.io.FileNotFoundException
import java.util.*
var graph = DirectedGraph()
var prefix = "https://simple.wikipedia.org/wiki/"
/* This function checks whether a link points to a Wikipedia article that should be indexed. It filters out special
 * pages (their titles contain a colon, which gives the URL a second colon after "https:") and ensures that the page
 * is on the desired subdomain of Wikipedia (designated by prefix). It also excludes links from a page to itself,
 * including table-of-contents fragment links (these contain "#").
 */
fun checkLink(url: String, originURL: String): Boolean {
    return url.startsWith(prefix) && !url.contains(originURL) &&
            url.count { it == ':' } < 2 && // "https:" is the only colon allowed; a second marks a special page
            !url.contains("#") && !url.contains("wikimedia")
}
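/*
 * A minimal sketch of checkLink in action. The URLs below are illustrative examples (not taken
 * from a real crawl), and demoCheckLink is a hypothetical helper that is never called from main:
 */
fun demoCheckLink() {
    val origin = "https://simple.wikipedia.org/wiki/Dog"
    println(checkLink("https://simple.wikipedia.org/wiki/Cat", origin))            // true: ordinary article
    println(checkLink("https://simple.wikipedia.org/wiki/Special:Random", origin)) // false: second colon marks a special page
    println(checkLink("https://simple.wikipedia.org/wiki/Cat#Anatomy", origin))    // false: "#" marks a fragment link
    println(checkLink("https://en.wikipedia.org/wiki/Cat", origin))                // false: not on the prefix subdomain
}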
/*
 * This function uses the jsoup library to find the links in a page's body paragraphs (anchors inside
 * div.mw-parser-output), returning those that pass the checkLink function above.
 */
fun getLinks(url: String): Collection<String> {
val doc = Jsoup.connect(url).get()
val content = doc.select("div.mw-parser-output")
.select("p").select("a[href]")
val list = ArrayList<String>()
for (link in content) {
val newURL = link.attr("abs:href")
if (checkLink(newURL, url)) {
list.add(newURL)
}
}
return list
}
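/*
 * A minimal usage sketch for getLinks (requires network access; the article name "Kotlin" is an
 * illustrative example, and demoGetLinks is a hypothetical helper not called from main):
 */
fun demoGetLinks() {
    val links = getLinks(prefix + "Kotlin")
    println("Found ${links.size} links")
    links.take(5).forEach { println(it) }
}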
/*
 * This is the core of the program. It uses a modified breadth-first search to follow links between
 * Wikipedia pages and build a directed graph representing the relationships between them.
 */
fun findLinks(startingURL: String, numLinks: Int, saveFrequency: Int, serializer: Serializer, filename: String) {
    val visited = HashSet<String>() // Pages that have already been discovered and enqueued
val queue = LinkedList<Vertex>()
queue.add(Vertex(startingURL))
var i = 0
while (queue.isNotEmpty() && i < numLinks) {
i++
if (i % saveFrequency == 0) {
serializer.write(graph.vertices, filename)
println(" $i")
}
val url = queue.remove().label
try {
for (newURL in getLinks(url)) {
graph.addEdge(url, newURL)
if (!visited.contains(newURL)) {
visited.add(newURL)
queue.add(graph.vertices[newURL]!!)
}
}
} catch (e: org.jsoup.HttpStatusException) {
println("incorrect URL: $url")
}
}
}
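/*
 * A minimal sketch of invoking findLinks directly. The parameter values and filename below are
 * illustrative, not the project's defaults: crawl up to 100 pages starting from the "Earth"
 * article, checkpointing the graph to "demo.json" every 25 pages.
 */
@UnstableDefault
fun demoFindLinks() {
    findLinks(prefix + "Earth", 100, 25, JSONSerializer(), "demo.json")
}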
/*
 * This function uses the Java standard library to build a priority queue from the vertices (via compareTo,
 * which compares the number of neighbors -- see DirectedGraph.kt for more info). It then prints the first
 * *count* vertices, which correspond to the pages with the highest number of links pointing to them.
 */
fun rankPages(count: Int, direction: String = "max") {
val pq = when (direction) {
"max" -> PriorityQueue(Collections.reverseOrder<Vertex>())
"min" -> PriorityQueue()
else -> throw IllegalArgumentException("$direction is invalid")
}
pq.addAll(graph.vertices.values)
repeat(count) {
val v = pq.poll()
val article = v.label.removePrefix(prefix)
println("$it: $article (${v.neighbors.size})")
}
}
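/*
 * For illustration, rankPages(3) prints output of the following shape (article titles and
 * neighbor counts are hypothetical):
 *   0: United_States (1234)
 *   1: Earth (987)
 *   2: Water (845)
 */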
/* The main function is where everything gets put together. It loads the graph into memory (see Serializer.kt),
 * starts graph creation at a random page, runs it for a specified number of links, and then ranks the pages.
 * Finally, it saves the graph. */
@UnstableDefault
fun main() {
val filename = "simple.json"
prefix = "https://simple.wikipedia.org/wiki/"
//val filename = "en.json"
//prefix = "https://en.wikipedia.org/wiki/"
    //val serializer: Serializer = JavaSerializer()
val serializer = JSONSerializer()
try {
val vertices = serializer.read(filename)
graph = DirectedGraph(vertices)
println("Successfully loaded $filename")
} catch (f: FileNotFoundException) {
println("File $filename not found, creating file")
} catch (e: StackOverflowError) {
println("Stack overflow, making a new graph")
}
repeat(1) {
println("Loop $it:")
findLinks(prefix + "Special:Random", 10, 250, serializer, filename)
rankPages(50)
//rankPages(50, "min")
serializer.write(graph.vertices, filename)
println(graph.vertices.size)
}
}
/* Links and Sources:
 * https://jsoup.org/cookbook/extracting-data/example-list-links
 * https://www.geeksforgeeks.org/serialization-in-java/
 * https://www.baeldung.com/jackson-object-mapper-tutorial
 * https://stackoverflow.com/questions/11003155/change-priorityqueue-to-max-priorityqueue
 * Used the graph code we wrote (including your starter code) as a baseline, but converted
 * it to Kotlin with the Java-to-Kotlin converter and made it a directed graph.
 */