generated from aboutcode-org/skeleton
-
Notifications
You must be signed in to change notification settings - Fork 23
/
maven.py
1633 lines (1350 loc) · 53.4 KB
/
maven.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
from collections import namedtuple
import gzip
import hashlib
import io
import json
import logging
import os
import re
from typing import Dict
from urllib.parse import urlparse

import arrow
import javaproperties
import requests
from bs4 import BeautifulSoup
from dateutil import tz
from jawa.util.utf import decode_modified_utf8
from packagedcode.maven import build_filename
from packagedcode.maven import build_url
from packagedcode.maven import get_urls
from packagedcode.maven import _parse
from packagedcode.maven import get_maven_pom
from packageurl import PackageURL
from packageurl import PackageURL

from minecode import seed
from minecode import priority_router
from minecode import visit_router
from minecode.visitors import java_stream
from minecode.visitors import HttpVisitor
from minecode.visitors import NonPersistentHttpVisitor
from minecode.visitors import URI
from packagedb.models import make_relationship
from packagedb.models import PackageContentType
from packagedb.models import PackageRelation
"""
This module handles the Maven repositories such as central and other
nexus-based maven repositories. This is dubbed the maven2 format for the
repository and support the v4 POM format.
Old Maven1 format repositories are not supported (e.g. with jars,
sources, poms directories and POM format v2/v3).
"""
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
TRACE = False
TRACE_DEEP = False
if TRACE:
logger.setLevel(logging.DEBUG)
MAVEN_BASE_URL = 'https://repo1.maven.org/maven2'
class GzipFileWithTrailing(gzip.GzipFile):
    """
    A subclass of gzip.GzipFile supporting files with trailing garbage. Ignore
    the garbage.
    """
    # TODO: what is first_file??
    # True until the first gzip member header has been processed.
    first_file = True
    # The two magic bytes that start every gzip stream.
    gzip_magic = b'\037\213'
    # Set to True once non-gzip bytes are found after a gzip member.
    has_trailing_garbage = False

    def _read_gzip_header(self):
        # read the first two bytes
        magic = self.fileobj.read(2)
        # rewind two bytes back
        self.fileobj.seek(-2, os.SEEK_CUR)
        # NOTE(review): despite its name, this flag is True when the two bytes
        # do NOT look like a gzip member header.
        is_gzip = magic != self.gzip_magic
        if is_gzip and not self.first_file:
            # A subsequent "member" does not start with the gzip magic: treat
            # everything from here on as trailing garbage and stop reading.
            self.first_file = False
            self.has_trailing_garbage = True
            raise EOFError('Trailing garbage found')
        self.first_file = False
        # Delegate actual header parsing to the stdlib implementation.
        # NOTE(review): recent CPython versions parse gzip headers inside an
        # internal reader object, so this override may never be invoked there
        # — confirm against the Python version this is deployed on.
        gzip.GzipFile._read_gzip_header(self)
class MavenSeed(seed.Seeder):
    """
    Seed the visit queue with the Maven Central Nexus full index and its
    properties file.
    """

    def get_seeds(self):
        yield 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz'
        yield 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties'
        # Other candidate seeds and mirrors, kept for reference:
        # - increments such as .../nexus-maven-repository-index.457.gz
        # - http://jcenter.bintray.com/
        # - https://repo2.maven.org/maven2/.index/nexus-maven-repository-index.gz
        # - other repos: http://stackoverflow.com/a/161846/302521
        # - google mirror (https://www.infoq.com/news/2015/11/maven-central-at-google):
        #   https://maven-central.storage.googleapis.com/repos/central/data/.index/nexus-maven-repository-index.properties
        # - possible apache mirror: http://repo.maven.apache.org/maven2/.index/nexus-maven-repository-index.properties
        # - ibiblio has an out of date mirror (no directory listing, last updated 20161121171437)
        # - clojars is its own repo, not a mirror: https://clojars.org/repo/.index/
        # - more mirrors: https://www.google.com/search?q=allinurl%3A%20.index%2Fnexus-maven-repository-index.properties&pws=0&gl=us&gws_rd=cr
        # - npm mirrors: https://maven-eu.nuxeo.org/nexus/#view-repositories;npmjs~browsestorage
def get_pom_text(namespace, name, version, qualifiers=None, base_url=MAVEN_BASE_URL):
    """
    Return the contents of the POM file of the package described by the purl
    field arguments in a string, or None if the POM cannot be fetched or
    `qualifiers` is not a mapping.
    """
    # Fix: the default used to be a mutable `{}`, shared across calls.
    if qualifiers is None:
        qualifiers = {}
    if qualifiers and not isinstance(qualifiers, dict):
        return
    # Create URLs using purl fields
    urls = get_urls(
        namespace=namespace,
        name=name,
        version=version,
        qualifiers=qualifiers,
        base_url=base_url,
    )
    # Get and parse POM info
    pom_url = urls['api_data_url']
    # TODO: manage different types of errors (404, etc.)
    response = requests.get(pom_url)
    if not response:
        return
    return response.text
def get_package_sha1(package):
    """
    Return the sha1 value for `package` by checking if the sha1 file exists for
    `package` on maven and returning the contents if it does.
    If the sha1 is missing or invalid, download the package's JAR and compute
    the sha1 from its contents. Return None when no sha1 can be determined.
    """
    # Fix: `sha1` was previously unbound when the .sha1 request was not ok,
    # causing an UnboundLocalError at the `if not sha1` check below.
    sha1 = None
    download_url = package.repository_download_url
    sha1_download_url = f'{download_url}.sha1'
    response = requests.get(sha1_download_url)
    if response.ok:
        # The .sha1 file may contain "<sha1>  <filename>"; keep the first token.
        sha1_contents = response.text.strip().split()
        sha1 = validate_sha1(sha1_contents[0])
    if not sha1:
        # Download JAR and calculate sha1 if we cannot get it from the repo
        response = requests.get(download_url)
        if response:
            sha1 = hashlib.sha1(response.content).hexdigest()
    return sha1
def fetch_parent(pom_text, base_url=MAVEN_BASE_URL):
    """
    Return the text of the parent POM of `pom_text`, or None if `pom_text` is
    empty or does not declare complete parent coordinates.
    """
    if not pom_text:
        return
    pom = get_maven_pom(text=pom_text)
    parent = pom.parent
    has_full_coordinates = (
        parent
        and parent.group_id
        and parent.artifact_id
        and parent.version.version
    )
    if not has_full_coordinates:
        return
    # Fetch the parent POM using the same repository base URL.
    return get_pom_text(
        namespace=parent.group_id,
        name=parent.artifact_id,
        version=str(parent.version.version),
        qualifiers={},
        base_url=base_url,
    )
def get_ancestry(pom_text, base_url=MAVEN_BASE_URL):
    """
    Return a list of POM texts of the ancestors of the POM `pom_text`, ordered
    from oldest ancestor to newest. The list is empty if there is no parent POM.
    """
    ancestors = []
    current_pom_text = pom_text
    while True:
        parent_pom_text = fetch_parent(pom_text=current_pom_text, base_url=base_url)
        if not parent_pom_text:
            break
        ancestors.append(parent_pom_text)
        current_pom_text = parent_pom_text
    # Fix: return an actual list rather than a one-shot `reversed` iterator,
    # as the docstring promises a list and callers may iterate more than once.
    return list(reversed(ancestors))
def get_merged_ancestor_package_from_maven_package(package, base_url=MAVEN_BASE_URL):
    """
    Merge the details of `package` with those of its ancestor POMs and return
    the merged package. Return None when `package` is empty.
    """
    if not package:
        return
    pom_text = get_pom_text(
        name=package.name,
        namespace=package.namespace,
        version=package.version,
        qualifiers=package.qualifiers,
        base_url=base_url,
    )
    ancestor_pom_texts = get_ancestry(pom_text)
    return merge_ancestors(
        ancestor_pom_texts=ancestor_pom_texts,
        package=package,
    )
def merge_parent(package, parent_package):
    """
    Merge `parent_package` data into `package` and return `package`.

    Only fields that are empty on `package` are populated from
    `parent_package`. For each field updated this way, a provenance note is
    appended to the ``package.extra_data['history']`` list.
    """
    mergeable_fields = (
        'declared_license_expression',
        'homepage_url',
        'parties',
    )
    for field in mergeable_fields:
        # If `field` is empty on the package we're looking at, populate
        # it with the value from the parent package.
        if not getattr(package, field):
            value = getattr(parent_package, field)
            setattr(package, field, value)
            msg = f'Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}'
            # Idiom: setdefault replaces the previous get/if/else dance.
            package.extra_data.setdefault('history', []).append(msg)
    return package
def merge_ancestors(ancestor_pom_texts, package):
    """
    Merge metadata from each POM text in `ancestor_pom_texts` into `package`
    and return it. `ancestor_pom_texts` is expected to be ordered from oldest
    ancestor to newest.
    """
    for pom_text in ancestor_pom_texts:
        ancestor_package = _parse(
            datasource_id='maven_pom',
            package_type='maven',
            primary_language='Java',
            text=pom_text,
        )
        # Later (newer) ancestors only fill fields still empty on `package`.
        package = merge_parent(package, ancestor_package)
    return package
def map_maven_package(package_url, package_content):
    """
    Add a maven `package_url` to the PackageDB.
    Return an error string if errors have occured in the process.
    """
    # Local imports: these pull in Django model helpers and are deliberately
    # not imported at module load time.
    from minecode.model_utils import add_package_to_scan_queue
    from minecode.model_utils import merge_or_create_package

    db_package = None
    error = ''
    # Honor an alternate repository given via the "repository_url" qualifier.
    if "repository_url" in package_url.qualifiers:
        base_url = package_url.qualifiers["repository_url"]
    else:
        base_url = MAVEN_BASE_URL
    pom_text = get_pom_text(
        namespace=package_url.namespace,
        name=package_url.name,
        version=package_url.version,
        qualifiers=package_url.qualifiers,
        base_url=base_url,
    )
    if not pom_text:
        msg = f'Package does not exist on maven: {package_url}'
        error += msg + '\n'
        logger.error(msg)
        return db_package, error
    # Parse the POM into a Package, then merge metadata inherited from its
    # ancestor (parent) POMs into it.
    package = _parse(
        'maven_pom',
        'maven',
        'Java',
        text=pom_text,
        base_url=base_url,
    )
    ancestor_pom_texts = get_ancestry(pom_text=pom_text, base_url=base_url)
    package = merge_ancestors(
        ancestor_pom_texts=ancestor_pom_texts,
        package=package
    )
    urls = get_urls(
        namespace=package_url.namespace,
        name=package_url.name,
        version=package_url.version,
        qualifiers=package_url.qualifiers,
        base_url=base_url,
    )
    # In the case of looking up a maven package with qualifiers of
    # `classifiers=sources`, the purl of the package created from the pom does
    # not have the qualifiers, so we need to set them. Additionally, the download
    # url is not properly generated since it would be missing the sources bit
    # from the filename.
    package.qualifiers = package_url.qualifiers
    package.download_url = urls['repository_download_url']
    package.repository_download_url = urls['repository_download_url']
    # Set package_content value
    package.extra_data['package_content'] = package_content
    # If sha1 exists for a jar, we know we can create the package
    # Use pom info as base and create packages for binary and source package
    # Check to see if binary is available
    sha1 = get_package_sha1(package)
    if sha1:
        package.sha1 = sha1
        db_package, _, _, _ = merge_or_create_package(package, visit_level=50)
    else:
        msg = f'Failed to retrieve JAR: {package_url}'
        error += msg + '\n'
        logger.error(msg)
    # Submit package for scanning
    if db_package:
        add_package_to_scan_queue(db_package)
    return db_package, error
def validate_sha1(sha1):
    """
    Validate a `sha1` string.
    Return `sha1` if it is valid, None otherwise.
    """
    if not sha1:
        # Empty values pass through unchanged (None stays None, '' stays '').
        return sha1
    if len(sha1) != 40:
        logger.warning(
            f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!'
        )
        return None
    return sha1
def map_maven_binary_and_source(package_url):
    """
    Get metadata for the binary and source release of the Maven package
    `package_url` and save it to the PackageDB.
    Return an error string for errors that occur, or empty string if there is no error.
    """
    error = ''
    package, emsg = map_maven_package(
        package_url,
        PackageContentType.BINARY
    )
    if emsg:
        error += emsg
    # Fix: build a distinct PURL for the source archive instead of mutating
    # `package_url.qualifiers` in place — the qualifiers mapping is shared
    # with the caller's PURL, so in-place mutation leaked the "sources"
    # classifier back to the caller.
    source_package_url = PackageURL(
        type=package_url.type,
        namespace=package_url.namespace,
        name=package_url.name,
        version=package_url.version,
        qualifiers=dict(package_url.qualifiers or {}, classifier='sources'),
        subpath=package_url.subpath,
    )
    source_package, emsg = map_maven_package(
        source_package_url,
        PackageContentType.SOURCE_ARCHIVE
    )
    if emsg:
        error += emsg
    if package and source_package:
        make_relationship(
            from_package=source_package,
            to_package=package,
            relationship=PackageRelation.Relationship.SOURCE_PACKAGE
        )
    return error
def map_maven_packages(package_url):
    """
    Given a valid `package_url` with no version, get metadata for the binary and
    source release for each version of the Maven package `package_url` and save
    it to the PackageDB.
    Return an error string for errors that occur, or empty string if there is no error.
    """
    error = ''
    namespace = package_url.namespace
    name = package_url.name
    # Query the maven.org search API for every known GAV (version) record.
    query_params = f'g:{namespace}+AND+a:{name}'
    url = f'https://search.maven.org/solrsearch/select?q={query_params}&core=gav'
    response = requests.get(url)
    if not response:
        return error
    docs = response.json().get('response', {}).get('docs', [])
    for doc in docs:
        version_purl = PackageURL(
            type='maven',
            namespace=doc.get('g'),
            name=doc.get('a'),
            version=doc.get('v'),
        )
        emsg = map_maven_binary_and_source(version_purl)
        if emsg:
            error += emsg
    return error
@priority_router.route('pkg:maven/.*')
def process_request(purl_str):
    """
    Process `priority_resource_uri` containing a maven Package URL (PURL) as a
    URI.
    This involves obtaining Package information for the PURL from maven and
    using it to create a new PackageDB entry. The package is then added to the
    scan queue afterwards. We also get the Package information for the
    accompanying source package and add it to the PackageDB and scan queue, if
    available.
    Return an error string for errors that occur, or empty string if there is no error.
    """
    try:
        package_url = PackageURL.from_string(purl_str)
    except ValueError as e:
        return f'error occured when parsing {purl_str}: {e}'
    # A versioned PURL maps one package; an unversioned one maps all versions.
    if package_url.version:
        return map_maven_binary_and_source(package_url)
    return map_maven_packages(package_url)
# Extract all href="..." values from an HTML directory-listing page.
collect_links = re.compile(r'href="([^"]+)"').findall
# Extract (link, timestamp) pairs from an Apache/Nexus-style listing, where
# the timestamp is "YYYY-MM-DD HH:MM" for files or "-" for directories.
collect_links_and_artifact_timestamps = re.compile(
    r'<a href="([^"]+)".*</a>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
).findall
def check_if_file_name_is_linked_on_page(file_name, links, **kwargs):
    """
    Return True if any entry in `links` ends with `file_name`.
    """
    for link in links:
        if link.endswith(file_name):
            return True
    return False
def check_if_page_has_pom_files(links, **kwargs):
    """
    Return True if any entry in `links` ends with ".pom".
    """
    for link in links:
        if link.endswith('.pom'):
            return True
    return False
def check_if_page_has_directories(links, **kwargs):
    """
    Return True if any entry, excluding the parent link "../", ends with "/".
    """
    for link in links:
        if link == '../':
            continue
        if link.endswith('/'):
            return True
    return False
def check_if_package_version_page(links, **kwargs):
    """
    Return True if `links` contains POM files and has no subdirectories,
    which is how a version-level page of a Maven repo looks.
    """
    if not check_if_page_has_pom_files(links=links):
        return False
    return not check_if_page_has_directories(links=links)
def check_if_package_page(links, **kwargs):
    """
    Return True if the page links to "maven-metadata.xml" but has no POM
    files, which is how a package-level page (listing versions) looks.
    """
    if not check_if_file_name_is_linked_on_page(file_name='maven-metadata.xml', links=links):
        return False
    return not check_if_page_has_pom_files(links=links)
def check_if_maven_root(links, **kwargs):
    """
    Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven
    repo contains "archetype-catalog.xml".
    """
    return check_if_file_name_is_linked_on_page(
        file_name='archetype-catalog.xml',
        links=links,
    )
def check_on_page(url, checker):
    """
    Fetch `url` and return the result of calling `checker` with the links
    collected from the page. Return False when the page cannot be fetched.
    """
    response = requests.get(url)
    if not response:
        return False
    links = collect_links(response.text)
    return checker(links=links)
def is_maven_root(url):
    """
    Return True if `url` is the root of a Maven repo, False otherwise.
    """
    return check_on_page(url, checker=check_if_maven_root)
def is_package_page(url):
    """
    Return True if `url` is a package page on a Maven repo, False otherwise.
    """
    return check_on_page(url, checker=check_if_package_page)
def is_package_version_page(url):
    """
    Return True if `url` is a package version page on a Maven repo, False otherwise.
    """
    return check_on_page(url, checker=check_if_package_version_page)
def url_parts(url):
    """
    Return a 3-tuple of (scheme, netloc, path_segments) for `url`, where
    `path_segments` is the list of non-empty "/"-separated path parts.
    """
    parsed = urlparse(url)
    segments = [segment for segment in parsed.path.split('/') if segment]
    return parsed.scheme, parsed.netloc, segments
def create_url(scheme, netloc, path_segments):
    """
    Return a URL string assembled from `scheme`, `netloc` and the list of
    `path_segments`.
    """
    joined_path = '/'.join(path_segments)
    return f'{scheme}://{netloc}/{joined_path}'
def get_maven_root(url):
    """
    Given `url`, that is a URL to namespace, package, or artifact in a Maven
    repo, return the URL to the root of that repo. If a Maven root cannot be
    determined, return None.
    >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
    'https://repo1.maven.org/maven2'
    """
    scheme, netloc, path_segments = url_parts(url)
    # Probe successively deeper prefixes of the path until one looks like a
    # repo root (shallowest match wins).
    for depth in range(len(path_segments)):
        candidate = create_url(scheme, netloc, path_segments[:depth + 1])
        if is_maven_root(candidate):
            return candidate
    return None
def determine_namespace_name_version_from_url(url, root_url=None):
    """
    Return a 3-tuple containing strings of a Package namespace, name, and
    version, determined from `url`, where `url` points to namespace, package,
    specific package version, or artifact on a Maven repo.
    Raise an Exception if a Maven root cannot be determined from `url`.
    >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
    ('net.shibboleth', 'parent', '7.11.0')
    """
    if not root_url:
        root_url = get_maven_root(url)
        if not root_url:
            raise Exception(f'Error: not a Maven repository: {url}')
    # Keep only the path below the repo root, e.g. "net/shibboleth/parent/7.11.0".
    _, remaining_path_segments = url.split(root_url)
    remaining_path_segments = remaining_path_segments.split('/')
    remaining_path_segments = [p for p in remaining_path_segments if p]
    namespace_segments = []
    package_name = ''
    package_version = ''
    # Walk down the path one segment at a time and classify each level by
    # probing the live repo: package page, package version page, or else a
    # namespace segment.
    for i in range(len(remaining_path_segments)):
        segment = remaining_path_segments[i]
        segments = remaining_path_segments[:i+1]
        path = '/'.join(segments)
        url_segment = f'{root_url}/{path}'
        if is_package_page(url_segment):
            package_name = segment
        elif is_package_version_page(url_segment):
            package_version = segment
        else:
            namespace_segments.append(segment)
    namespace = '.'.join(namespace_segments)
    return namespace, package_name, package_version
def add_to_import_queue(url, root_url):
    """
    Create an ImportableURI for the Maven repo package page at `url`.
    """
    from minecode.models import ImportableURI

    response = requests.get(url)
    data = response.text if response else None
    namespace, name, _version = determine_namespace_name_version_from_url(url, root_url)
    purl = PackageURL(
        type='maven',
        namespace=namespace,
        name=name,
    )
    importable_uri = ImportableURI.objects.insert(url, data, purl)
    if importable_uri:
        logger.info(f'Inserted {url} into ImportableURI queue')
def filter_only_directories(timestamps_by_links):
    """
    Return the subset of the `timestamps_by_links` mapping whose keys are
    directory links: entries ending with "/", excluding the parent link "../".
    """
    return {
        link: timestamp
        for link, timestamp in timestamps_by_links.items()
        if link != '../' and link.endswith('/')
    }
# File extensions of Maven artifacts worth collecting from a directory
# listing. "pom" is deliberately excluded: POMs are handled separately.
valid_artifact_extensions = [
    'ejb3',
    'ear',
    'aar',
    'apk',
    'gem',
    'jar',
    'nar',
    # 'pom',
    'so',
    'swc',
    'tar',
    'tar.gz',
    'war',
    'xar',
    'zip',
]
def filter_for_artifacts(timestamps_by_links, extensions=None):
    """
    Given a mapping of `timestamps_by_links`, where the links are the filenames
    of Maven artifacts, return a mapping of filenames whose extension is one of
    `extensions` (defaulting to `valid_artifact_extensions`) and their
    timestamps.
    """
    if extensions is None:
        extensions = valid_artifact_extensions
    # Fix: match on ".<ext>" rather than the bare extension text so that e.g.
    # "avatar" no longer matches the "tar" extension.
    suffixes = tuple(f'.{ext}' for ext in extensions)
    return {
        link: timestamp
        for link, timestamp in timestamps_by_links.items()
        if link.endswith(suffixes)
    }
def collect_links_from_text(text, filter):
    """
    Return a mapping of link locations to their timestamps, given HTML `text`
    content, filtered with the `filter` callable.
    """
    timestamps_by_links = {}
    for link, timestamp in collect_links_and_artifact_timestamps(text):
        # "-" marks a directory entry that has no timestamp in the listing.
        timestamps_by_links[link] = '' if timestamp == '-' else timestamp
    return filter(timestamps_by_links=timestamps_by_links)
def create_absolute_urls_for_links(text, url, filter):
    """
    Given the `text` contents from `url`, return a mapping of absolute URLs of
    the links from `url` to their timestamps, filtered by `filter`.
    """
    base = url.rstrip('/')
    timestamps_by_absolute_links = {}
    for link, timestamp in collect_links_from_text(text, filter).items():
        # Relative links are anchored to the page URL.
        absolute_link = link if link.startswith(base) else f'{base}/{link}'
        timestamps_by_absolute_links[absolute_link] = timestamp
    return timestamps_by_absolute_links
def get_directory_links(url):
    """
    Return a mapping of absolute directory URLs to timestamps for the
    hyperlinks found at `url`. Return an empty mapping when the page cannot be
    fetched.
    """
    response = requests.get(url)
    if not response:
        return {}
    return create_absolute_urls_for_links(
        response.text,
        url=url,
        filter=filter_only_directories,
    )
def get_artifact_links(url):
    """
    Return a mapping of absolute artifact URLs to timestamps for the
    hyperlinks found at `url`. Return an empty mapping when the page cannot be
    fetched.
    """
    # Fix: default to an empty dict — the success path returns a mapping, so
    # the previous empty-list fallback made the return type inconsistent.
    timestamps_by_artifact_links = {}
    response = requests.get(url)
    if response:
        timestamps_by_artifact_links = create_absolute_urls_for_links(
            response.text,
            url=url,
            filter=filter_for_artifacts
        )
    return timestamps_by_artifact_links
def crawl_to_package(url, root_url):
    """
    Recursively crawl the Maven repo rooted at `root_url` starting from `url`,
    depth-first, and queue every package page found for import.
    """
    if is_package_page(url):
        add_to_import_queue(url, root_url)
        return
    for directory_link in get_directory_links(url):
        crawl_to_package(directory_link, root_url)
def crawl_maven_repo_from_root(root_url):
    """
    Given the `url` to a maven root, traverse the repo depth-first and add
    packages to the import queue.
    """
    crawl_to_package(url=root_url, root_url=root_url)
def get_artifact_sha1(artifact_url):
    """
    Return the SHA1 checksum string of the Maven artifact at `artifact_url`,
    or None when the checksum file is unavailable or invalid.
    """
    response = requests.get(f'{artifact_url}.sha1')
    if not response:
        return None
    # The .sha1 file may contain "<sha1>  <filename>"; keep the first token.
    first_token = response.text.strip().split()[0]
    return validate_sha1(first_token)
def get_classifier_from_artifact_url(artifact_url, package_version_page_url, package_name, package_version):
    """
    Return the classifier from a Maven artifact URL `artifact_url`, otherwise
    return None if a classifier cannot be determined from `artifact_url`.

    For example, given
    https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar
    the classifier is "onejar".
    """
    # Everything before the classifier/extension part of the artifact name:
    # "<version page>/<name>-<version>"
    base = package_version_page_url.rstrip('/')
    prefix = f'{base}/{package_name}-{package_version}'
    # For the example above, remaining is "-onejar.jar".
    _, remaining = artifact_url.split(prefix)
    # First dot-separated piece, e.g. "-onejar" (empty for a plain artifact).
    classifier_part = remaining.split('.')[0]
    if not classifier_part:
        return None
    # Drop the leading "-" separator when present.
    if classifier_part.startswith('-'):
        classifier_part = classifier_part[1:]
    return classifier_part
@visit_router.route('http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties')
@visit_router.route('https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties')
class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor):
    """
    Fetch the Nexus index properties file and yield one URI per incremental
    index fragment it references.
    """

    def get_uris(self, content):
        """
        Parse a NEXUS index properties file and yield increment index URIs.
        The file is a Java properties file with rows such as:
        nexus.index.incremental-15=526
        nexus.index.incremental-14=527
        Each value points to an incremental index fragment that has the same
        format as the full index. `content` is the path to the downloaded
        properties file.
        """
        increment_url_template = (
            'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{index}.gz'
        )
        with open(content) as properties_file:
            properties = javaproperties.load(properties_file) or {}
        for key, increment_index in properties.items():
            if not key.startswith('nexus.index.incremental'):
                continue
            yield URI(
                uri=increment_url_template.format(index=increment_index),
                source_uri=self.uri,
            )
@visit_router.route(
    'https?://.*/nexus-maven-repository-index.gz',
    # increments
    'https?://.*/nexus-maven-repository-index\.\d+\.gz')
class MavenNexusIndexVisitor(NonPersistentHttpVisitor):
    """
    Download and process a Nexus Maven index file.
    WARNING: Processing is rather long: a full index is ~600MB.
    """

    def get_uris(self, content):
        """
        Yield a combo of pre-visited URIs with a special maven-index://
        scheme together with other regular fetchable URIs for POMs and
        JARs found in a Maven index.
        For NonPersistentHttpVisitor content is the path to the temp Gzipped
        index file, not the actual file content.
        """
        index_location = content
        # NOTE(review): `get_artifacts` and `is_worthy_artifact` are not
        # visible in this section of the file; presumably defined further
        # down in this module — confirm.
        artifacts = get_artifacts(index_location, worthyness=is_worthy_artifact)
        for artifact in artifacts:
            # we cannot do much without these
            group_id = artifact.group_id
            artifact_id = artifact.artifact_id
            version = artifact.version
            extension = artifact.extension
            if not (group_id and artifact_id and version and extension):
                continue
            # Non-jar artifacts carry their extension as the "type" qualifier.
            qualifiers = {}
            if extension and extension != 'jar':
                qualifiers['type'] = extension
            classifier = artifact.classifier
            if classifier:
                qualifiers['classifier'] = classifier
            package_url = PackageURL(
                type='maven',
                namespace=group_id,
                name=artifact_id,
                version=version,
                qualifiers=qualifiers or None,
            )
            # FIXME: also use the Artifact.src_exist flags too?
            # build a URL: This is the real JAR download URL
            # FIXME: this should be set at the time of creating Artifacts
            # instead togther with the filename... especially we could use
            # different REPOs.
            jar_download_url, file_name = build_url_and_filename(
                group_id, artifact_id, version, extension, classifier)
            # FIXME: should this be set in the yielded URI too
            last_mod = artifact.last_modified
            # We yield a pre-visited URI for each JAR
            mock_maven_index_uri = build_url(
                group_id, artifact_id, version, file_name,
                base_url='maven-index://repo1.maven.org')
            # Serialize the artifact details (plus the real download URL) as
            # compact JSON carried in the URI data field.
            artifact_data = artifact.to_dict()
            artifact_data['download_url'] = jar_download_url
            artifact_as_json = json.dumps(artifact_data, separators=(',', ':'))
            yield URI(
                # this is the Maven index index URI
                source_uri=self.uri,
                # FIXME: remove these mock URIs after migration
                uri=mock_maven_index_uri,
                package_url=package_url.to_string(),
                visited=True,
                mining_level=0,
                file_name=file_name,
                size=artifact.size,
                sha1=artifact.sha1,
                date=last_mod,
                data=artifact_as_json,
            )
            # A second PURL without qualifiers, shared by the POM URI below.
            package_url = PackageURL(
                type='maven',
                namespace=group_id,
                name=artifact_id,
                version=version,
            )
            # also yield a POM for this. There are no artifacts for
            # the POM of a Jar in the repo. Only for Parent POMs
            # therefore we create a download with the pomextension
            pom_download_url, pom_file_name = build_url_and_filename(
                group_id, artifact_id, version, extension='pom', classifier='')
            yield URI(
                # this is the Maven index index URI
                source_uri=self.uri,
                uri=pom_download_url,
                # use the same PURL as the main jar
                package_url=package_url.to_string(),
                visited=False,
                mining_level=20,
                file_name=pom_file_name,
                size=0,
                date=last_mod,
            )
@visit_router.route('https?://jcenter\.bintray\.com/(.+/)*')
class MavenHTMLPageVisitor(HttpVisitor):
    """
    Parse a jcenter/bintray HTML listing page and yield the URIs it links to,
    both files and sub-directories.
    The route regex uses "." (any character except newline) because some
    paths contain quotes, e.g. http://jcenter.bintray.com/'com/virtualightning'/
    — this case is covered in the tests.
    """

    def get_uris(self, content):
        page = BeautifulSoup(content, 'lxml')
        for pre_tag in page.find_all(name='pre'):
            for anchor in pre_tag.find_all(name='a'):
                href = anchor.get('href')
                if not href:
                    continue
                # ":" is a special char for the bintray repo; drop it.
                if href.startswith(':'):
                    href = href[1:]
                # Links ending in "/" are folders and carry no file name.
                file_name = None if href.endswith('/') else href
                yield URI(
                    uri=self.uri + href,
                    visited=False,
                    file_name=file_name,
                    source_uri=self.uri,
                )
@visit_router.route('https?://.*/maven-metadata\.xml')
class MavenMetaDataVisitor(HttpVisitor):
"""
Parse the maven-metadata.xml file and yield uris of jars and pom.
"""
def get_uris(self, content):
# FIXME this may not be correct. The only thing we can infer from the maven
# metadata is wha are the groupid/artifactid and available versions
# The actual download files likely need to be obtained from directory listing
# or infered from parsing the POM???
base_url = self.uri.partition('maven-metadata.xml')[0] + '{version}/'
pom_url = base_url + '{artifactId}-{version}.pom'
# FIXME: this may not exist and or with another extension?? and this should be PREVISITED
jar_url = base_url + '{artifactId}-{version}.jar'
# FIXME: sources may not exists?? and this should be PREVISITED
source_url = base_url + '{artifactId}-{version}-sources.jar'
# FIXME: why use BeautifulSoup for valid XML???
page = BeautifulSoup(content, 'lxml-xml')
group_id = page.find(name='groupId')
artifact_id = page.find(name='artifactId')
if not (group_id and artifact_id):
return
group_id = group_id.string
artifact_id = artifact_id.string
for version in page.find_all('version'):
version = version.string
# FIXME: we may not get the proper extensions and classifiers and miss the qualifiers