-
Notifications
You must be signed in to change notification settings - Fork 99
/
process_licenses.py
186 lines (165 loc) · 9.17 KB
/
process_licenses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python
# This script processes licenses of 3rd party dependencies and stores them in the JAR. The rules are:
# 1. For dependencies that contain a license file, that file should be put into the shaded JAR as-is.
# 2. Dependencies that do not contain a license file should be listed in the file ADDITIONAL_LICENCES, together with the names of their licenses.
#
#
# The script accepts the following arguments:
# * DEPENDENCY_LIST_FILE_PATH
# * Can be obtained by running mvn dependency:list -DincludeScope=runtime -DoutputFile=target/dependency_list.txt
# * DEPENDENCIES_DIR
# * Directory containing the JAR files of all SDK dependencies. Automatically generated by `mvn clean package` in target/dependency-jars
# * TARGET_DIR
# * Where to save all output, should be target/generated-sources/META-INF/third-party-licenses
#
#
# Useful mvn commands:
# * mvn clean license:add-third-party
# * Generate dependency report; useful to find out licenses for dependencies that don't ship with a license file
# * mvn dependency:list -DincludeScope=runtime -DoutputFile=target/dependency_list.txt
# * Used as input of this script (DEPENDENCY_LIST_FILE_PATH)
import sys
from pathlib import Path
from zipfile import ZipFile
# Human-readable license names, used as values in ADDITIONAL_LICENSES_MAP
# and written verbatim into the generated ADDITIONAL_LICENCES file.
APACHE_LICENSE = "Apache License 2.0"
BSD_2_CLAUSE_LICENSE = "2-Clause BSD License"
BSD_3_CLAUSE_LICENSE = "3-Clause BSD License"
EDL_10_LICENSE = "EDL 1.0"
MIT_LICENSE = "The MIT License"
GO_LICENSE = "The Go license"
BOUNCY_CASTLE_LICENSE = "Bouncy Castle License"
LGPL = "LGPL License"

# "groupId:artifactId" coordinates that are skipped entirely: these
# dependencies are not shaded, so their licenses need not be included.
IGNORED_DEPENDENCIES = {
    "net.snowflake:snowflake-jdbc",
    "org.slf4j:slf4j-api",
}

# "groupId:artifactId" -> license name, for dependencies whose JAR does not
# ship a license file.
# Only add a new record here after verifying that the dependency JAR really
# does not contain a license!
ADDITIONAL_LICENSES_MAP = {
    "com.eclipsesource.minimal-json:minimal-json": MIT_LICENSE,
    "com.fasterxml.jackson.dataformat:jackson-dataformat-protobuf": APACHE_LICENSE,
    "com.github.ben-manes.caffeine:caffeine": APACHE_LICENSE,
    "com.github.luben:zstd-jni": BSD_2_CLAUSE_LICENSE,
    "com.google.code.findbugs:jsr305": APACHE_LICENSE,
    "com.google.crypto.tink:tink": APACHE_LICENSE,
    "com.google.errorprone:error_prone_annotations": APACHE_LICENSE,
    "com.google.code.findbugs:annotations": LGPL,
    "com.google.code.gson:gson": APACHE_LICENSE,
    "com.google.guava:failureaccess": APACHE_LICENSE,
    "com.google.guava:listenablefuture": APACHE_LICENSE,
    "com.google.j2objc:j2objc-annotations": APACHE_LICENSE,
    "com.google.protobuf:protobuf-java": BSD_3_CLAUSE_LICENSE,
    "com.google.protobuf:protobuf-java-util": BSD_3_CLAUSE_LICENSE,
    "com.google.re2j:re2j": GO_LICENSE,
    "com.hubspot.jackson:jackson-datatype-protobuf": APACHE_LICENSE,
    "com.ibm.jsonata4java:JSONata4Java": APACHE_LICENSE,
    "com.squareup:protoparser": APACHE_LICENSE,
    "dev.failsafe:failsafe": APACHE_LICENSE,
    "info.picocli:picocli": APACHE_LICENSE,
    "io.confluent:common-utils": APACHE_LICENSE,
    "io.confluent:dek-registry-client": APACHE_LICENSE,
    "io.confluent:kafka-avro-serializer": APACHE_LICENSE,
    "io.confluent:kafka-connect-avro-converter": APACHE_LICENSE,
    "io.confluent:kafka-connect-avro-data": APACHE_LICENSE,
    "io.confluent:kafka-schema-converter": APACHE_LICENSE,
    "io.confluent:kafka-schema-registry-client": APACHE_LICENSE,
    "io.confluent:kafka-schema-registry-client-encryption": APACHE_LICENSE,
    "io.confluent:kafka-schema-registry-client-encryption-tink": APACHE_LICENSE,
    "io.confluent:kafka-schema-rules": APACHE_LICENSE,
    "io.confluent:kafka-schema-serializer": APACHE_LICENSE,
    "io.confluent:logredactor": APACHE_LICENSE,
    "io.confluent:logredactor-metrics": APACHE_LICENSE,
    "io.dropwizard.metrics:metrics-core": APACHE_LICENSE,
    "io.dropwizard.metrics:metrics-jmx": APACHE_LICENSE,
    "io.dropwizard.metrics:metrics-jvm": APACHE_LICENSE,
    "io.swagger.core.v3:swagger-annotations": APACHE_LICENSE,
    "net.snowflake:snowflake-kafka-connector": APACHE_LICENSE,
    "net.snowflake:snowflake-ingest-sdk": APACHE_LICENSE,
    "org.agrona:agrona": APACHE_LICENSE,
    "org.antlr:antlr4-runtime": BSD_3_CLAUSE_LICENSE,
    "org.apache.kafka:kafka-clients": APACHE_LICENSE,
    "org.apache.parquet:parquet-common": APACHE_LICENSE,
    "org.apache.parquet:parquet-format-structures": APACHE_LICENSE,
    "org.bouncycastle:bc-fips": BOUNCY_CASTLE_LICENSE,
    "org.bouncycastle:bcpkix-fips": BOUNCY_CASTLE_LICENSE,
    "org.projectnessie.cel:cel-core": APACHE_LICENSE,
    "org.projectnessie.cel:cel-generated-antlr": APACHE_LICENSE,
    "org.projectnessie.cel:cel-generated-pb": APACHE_LICENSE,
    "org.projectnessie.cel:cel-jackson": APACHE_LICENSE,
    "org.projectnessie.cel:cel-tools": APACHE_LICENSE,
    "org.xerial.snappy:snappy-java": APACHE_LICENSE,
    "org.yaml:snakeyaml": APACHE_LICENSE,
    "org.apache.iceberg:iceberg-api": APACHE_LICENSE,
    "org.apache.iceberg:iceberg-core": APACHE_LICENSE,
    "org.apache.iceberg:iceberg-common": APACHE_LICENSE,
    "io.airlift:aircompressor": APACHE_LICENSE,
    "org.roaringbitmap:RoaringBitmap": APACHE_LICENSE,
}
def parse_cmdline_args():
    """Read and validate the three positional command-line arguments.

    Expects ``sys.argv`` to hold exactly the script name followed by
    DEPENDENCY_LIST_FILE_PATH, DEPENDENCIES_DIR and TARGET_DIR.

    Returns:
        A 3-tuple of absolute ``Path`` objects:
        ``(dependency_list_file, dependencies_dir, target_dir)``.

    Raises:
        Exception: if the argument count is wrong, if the dependency list is
            not an existing file, or if the dependencies directory is not an
            existing directory. The target directory is not checked here.
    """
    if len(sys.argv) != 4:
        raise Exception("usage: process_licenses.py DEPENDENCY_LIST_FILE_PATH DEPENDENCIES_DIR TARGET_DIR")

    # Make all three arguments absolute in one go.
    list_file, jars_dir, output_dir = (Path(raw).absolute() for raw in sys.argv[1:4])

    if not (list_file.exists() and list_file.is_file()):
        raise Exception(f"File {list_file} does not exist")

    if not (jars_dir.exists() and jars_dir.is_dir()):
        raise Exception(f"Directory {jars_dir} does not exist")

    return list_file, jars_dir, output_dir
def main():
    """Collect license information for every dependency in the list file.

    For each dependency line in the dependency list file:

    * dependencies listed in IGNORED_DEPENDENCIES are counted and skipped;
    * if the dependency JAR contains ``META-INF/LICENSE.txt``,
      ``META-INF/LICENSE`` or ``META-INF/LICENSE.md``, that file is extracted
      into the target directory as ``LICENSE_<groupId>__<artifactId>``;
    * otherwise the license name is looked up in ADDITIONAL_LICENSES_MAP and
      recorded in the ``ADDITIONAL_LICENCES`` file in the target directory.

    A summary of the counts is printed at the end.

    Raises:
        Exception: if an expected JAR file is missing, if a dependency has
            neither a bundled license file nor an entry in
            ADDITIONAL_LICENSES_MAP, or if fewer than 30 dependencies were
            listed (which suggests broken input).
    """
    dependency_list_path, dependency_jars_path, target_dir = parse_cmdline_args()

    dependency_count = 0
    dependency_with_license_count = 0
    dependency_without_license_count = 0
    dependency_ignored_count = 0
    # One "groupId:artifactId: license\n" entry per dependency without a bundled license.
    missing_licenses = []

    license_file_names = ("META-INF/LICENSE.txt", "META-INF/LICENSE", "META-INF/LICENSE.md")

    target_dir.mkdir(parents=True, exist_ok=True)

    with open(dependency_list_path, "r") as dependency_file_handle:
        for line in dependency_file_handle:
            line = line.strip()
            if line == "" or line == "The following files have been resolved:":
                continue
            dependency_count += 1

            # Line is a string like: "commons-codec:commons-codec:jar:1.15:compile -- module org.apache.commons.codec [auto]"
            artifact_details = line.split()[0]
            group_id, artifact_id, _, version, _ = artifact_details.split(":")

            current_jar = Path(dependency_jars_path, f"{artifact_id}-{version}.jar")
            # is_file() is False for both a missing path and a non-file path.
            if not current_jar.is_file():
                raise Exception(f"Expected JAR file does not exist: {current_jar}")

            dependency_lookup_key = f"{group_id}:{artifact_id}"
            # Skip ignored dependencies before opening the JAR.
            if dependency_lookup_key in IGNORED_DEPENDENCIES:
                dependency_ignored_count += 1
                continue

            license_found = False
            # The context manager closes the archive even if extraction fails.
            with ZipFile(current_jar) as current_jar_as_zip:
                for zip_info in current_jar_as_zip.infolist():
                    if zip_info.is_dir():
                        continue
                    if zip_info.filename in license_file_names:
                        license_found = True
                        dependency_with_license_count += 1
                        # Extract license to the target directory. Overwriting
                        # `filename` on the ZipInfo changes the name of the
                        # extracted file, so every dependency gets a unique
                        # LICENSE_<group>__<artifact> file.
                        zip_info.filename = f"LICENSE_{group_id}__{artifact_id}"
                        current_jar_as_zip.extract(zip_info, target_dir)
                        break
                    if "license" in zip_info.filename.lower():  # Log potential license matches
                        print(f"Potential license match: {current_jar} {zip_info}")

            if not license_found:
                print(f"License not found {current_jar}; using value from ADDITIONAL_LICENSES_MAP")
                license_name = ADDITIONAL_LICENSES_MAP.get(dependency_lookup_key)
                if not license_name:
                    raise Exception(
                        f"The dependency {dependency_lookup_key} does not ship a license file "
                        f"and is not defined in ADDITIONAL_LICENSES_MAP"
                    )
                dependency_without_license_count += 1
                missing_licenses.append(f"{dependency_lookup_key}: {license_name}\n")

    with open(Path(target_dir, "ADDITIONAL_LICENCES"), "w") as additional_licenses_handle:
        additional_licenses_handle.write("".join(missing_licenses))

    # Sanity check: a very short dependency list most likely means bad input.
    if dependency_count < 30:
        raise Exception(f"Suspiciously low number of dependency JARs detected in {dependency_jars_path}: {dependency_count}")

    print("License generation finished")
    print(f"\tTotal dependencies: {dependency_count}")
    print(f"\tTotal dependencies (with license): {dependency_with_license_count}")
    print(f"\tTotal dependencies (without license): {dependency_without_license_count}")
    print(f"\tIgnored dependencies: {dependency_ignored_count}")


if __name__ == "__main__":
    main()