From 0a4915abf741a4f7f8017db24057234eae4b9738 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Thu, 16 Jun 2022 10:46:39 +0530 Subject: [PATCH 01/59] create an app to detect licenses from input text Signed-off-by: Akhil Raj --- scancodeio/settings.py | 3 +- scancodeio/static/favicon.ico | Bin 0 -> 15406 bytes scancodeio/urls.py | 1 + scanpipe/templates/scanpipe/base.html | 1 + scantext/__init__.py | 21 ++ scantext/admin.py | 25 ++ scantext/apps.py | 28 ++ scantext/forms.py | 64 ++++ scantext/migrations/__init__.py | 0 scantext/models.py | 25 ++ .../includes/license_summary_level.html | 35 ++ .../templates/scantext/license_detail.html | 107 ++++++ scantext/templates/scantext/license_scan.html | 84 +++++ scantext/tests.py | 25 ++ scantext/urls.py | 29 ++ scantext/views.py | 312 ++++++++++++++++++ 16 files changed, 759 insertions(+), 1 deletion(-) create mode 100644 scancodeio/static/favicon.ico create mode 100644 scantext/__init__.py create mode 100644 scantext/admin.py create mode 100644 scantext/apps.py create mode 100644 scantext/forms.py create mode 100644 scantext/migrations/__init__.py create mode 100644 scantext/models.py create mode 100644 scantext/templates/scantext/includes/license_summary_level.html create mode 100644 scantext/templates/scantext/license_detail.html create mode 100644 scantext/templates/scantext/license_scan.html create mode 100644 scantext/tests.py create mode 100644 scantext/urls.py create mode 100644 scantext/views.py diff --git a/scancodeio/settings.py b/scancodeio/settings.py index 665dd3eb5..63c9dc4da 100644 --- a/scancodeio/settings.py +++ b/scancodeio/settings.py @@ -45,7 +45,7 @@ ALLOWED_HOSTS = env.list("ALLOWED_HOSTS", default=[".localhost", "127.0.0.1", "[::1]"]) # SECURITY WARNING: don't run with debug turned on in production -DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) +DEBUG = env.bool("SCANCODEIO_DEBUG", default=True) SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False @@ 
-78,6 +78,7 @@ # Local apps # Must come before Third-party apps for proper templates override "scanpipe", + "scantext", # Django built-in "django.contrib.auth", "django.contrib.contenttypes", diff --git a/scancodeio/static/favicon.ico b/scancodeio/static/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..e459dba5ac9a21d6dc81430b10b8d4da5a7d366b GIT binary patch literal 15406 zcmeHO36vJa5$>Mp?w>kMTq*8XHXI}L=pvKf+A>Q@M6IQyigHQ zNEC4+AP6FYpooI7Z-1; zuBxsogpk4&6%_*SN-L^4l+6E1(@9!tXuC78>=5`7daz7N#^N4lX>o4a;MBE^Th80_ypgd{}%~Yk-B~F4SbKEMX5-s-jGB5MnHTqyUivb*6K=*_*r%;~Ngi2A&hQJ# ztbdElv+t0)c9G?4bPhFbA&E@uU=N)U;~)p`TAXItt0a#d81CQEeaY1RFiFpr_sA@N zoy;?TOZM%A{9u30>R$(P5yjtt?}nU#hMYkQo2nE_Xk`Pv~wVQE@5O=BYjWWI&)}2lFq2zfs?(*-_ZR;W9nR#kA>ASa(nz@ox%^K1V zeUCCPzF!=E*ipUY2Nq7vS_PZ(ZQ3RLoW`KR z!2e*MPj)^kz^|^nJHRhmx3%rE>ZG8p!FVB~YS!cAoOXWbKh>WldHkt?pJm{O`LgW{ zJ*QFdvv131z@ImGw?2a0&&;sCpTEAw#;>kk5XTGOk$p^cLv8x~0dg)t7t`2K{+nms z33;szxmVnl6Q{p^eh2yzbY%P4(Q$>o^@44|+TBZM7QL75XR8A^J@46~;j-z`_iYd6 zalde9>O6YUH_2q(8O)mB;BIETJ*XTI`1jfK^F1iLE&7}FrD)UMqoI9g(zmR({AU?s z9qN8!n$>fB-ZRhrBQ{r*2ePt1>D!;SbQs+8^jE)4&d7^Nbm)wG=FekrQT+VPdOwi& z`tz3feME(wGX@Kr>x>#-5Z3wVn7sHc4Rh9##9b(Xm(K!hlzH-v^f_R|)bxjOFYBFz zZQqI9OJ_lE6z2!ndpYjx`nC;m30R`lza&8-_SAt*MA%*L5Nmdh9_^F1pQ(wbuc?<0`_eeWM)ru z$xRV>_b`o2E7Q!R^BZuB@$4-E{&3MTOzUEw@0mK<*MeyGcnF#!GIO}_wIn)PfV>x4 z&*6Ml3(;QgZ@IordGE@5SF{Y6b$=j@Sg%5E(7kX9xgVcQ?)d3mEOOav$(Uy;yxod8 z=^#l0s9s5;T}K;tI7)ZFRY8fam?D* zYShby>~th?%&_2j$9Xgx{uG-lL2s`0m(U(NGmtrj_*8%g`4=l^5xI6CGeLg|c$x%; zWBtf+1ZM!Ur|;Ml(5JZ`rt1g7ew=;Hd)wS}*W*{y=LZAX zfgRI0s{ww%xR>1$vsKR6sbT-2kRj9+>HF(!9+zvgtgOovpTcBptyU}_4qfJcI?TcH zI^9cdIyCfiju*LA=TzkCvU1e{zgee%=dAP9jOEFAcyrMYe@R_6-`1DKXk~i+r(Gv< zP8p3eu)x;p1u_$riDr@?J!3^+y90WeZqy1ogGV7BvXsnYKZVcvSJFPt@lXW7kW?pi1+XDLb7{;0|JK%o{RDl>c+Xh{Cn)}VZ3RD4dbPN~7w{U=_kr)Q?&Y_7^{Z*P zbICB)EPM6~*MM4EtKX+CwDqW*duF}E^;dlNkUfs^d}F4GYh82XqHSICQGLRBc|6zu zo;L~hai^CP=Y0e}6l#`jU9o9JqE%byyCvy-$*~4~=cbs>h{{vmZ`?24abHa5DfFzW 
zevD+N?%^8z{V4F}JE5(0%R#S|g=imqVs#B_&LF3G>Gkn4FHy&(ut!`2o|w;{o8tpq z3)g>G>^_qh2kPFq);fBB%{jl1hu`c!aE;i=Plfd-KE07Q*B>ePXa;p)wgx*oN7^@^ znYC|t_PtA3yOdZ1*Y$919`gMMtGn$w8Q;1$;a9HqVptbnhkCY~?76b{(p+_eX|!jz z@qXYOaeWZS#JJv0O}@*kU^ literal 0 HcmV?d00001 diff --git a/scancodeio/urls.py b/scancodeio/urls.py index a8bd62ce9..5d2dc7409 100644 --- a/scancodeio/urls.py +++ b/scancodeio/urls.py @@ -52,6 +52,7 @@ path("admin/", admin.site.urls), path("api/", include(api_router.urls)), path("license/", include(licenses.urls)), + path("scan/", include("scantext.urls")), path("", include("scanpipe.urls")), path("", RedirectView.as_view(url="project/")), ] diff --git a/scanpipe/templates/scanpipe/base.html b/scanpipe/templates/scanpipe/base.html index 711a79b80..0c5b9835c 100644 --- a/scanpipe/templates/scanpipe/base.html +++ b/scanpipe/templates/scanpipe/base.html @@ -5,6 +5,7 @@ {% block title %}ScanCode.io{% endblock %} + +{% endfor %} \ No newline at end of file diff --git a/scantext/templates/scantext/includes/license_detail_card.html b/scantext/templates/scantext/includes/license_summary_cards.html similarity index 80% rename from scantext/templates/scantext/includes/license_detail_card.html rename to scantext/templates/scantext/includes/license_summary_cards.html index a546039bd..f9df2aefe 100644 --- a/scantext/templates/scantext/includes/license_detail_card.html +++ b/scantext/templates/scantext/includes/license_summary_cards.html @@ -27,13 +27,11 @@

{{ license.category }}

ref -

- {% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %} -

+

{% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %}

-
-
+
+
{{ license.matched_text }}
diff --git a/scantext/templates/scantext/includes/license_summary_level.html b/scantext/templates/scantext/includes/license_summary_header.html similarity index 100% rename from scantext/templates/scantext/includes/license_summary_level.html rename to scantext/templates/scantext/includes/license_summary_header.html diff --git a/scantext/templates/scantext/license_detail.html b/scantext/templates/scantext/license_detail.html deleted file mode 100644 index 2cce30c4e..000000000 --- a/scantext/templates/scantext/license_detail.html +++ /dev/null @@ -1,55 +0,0 @@ -{% extends 'scanpipe/base.html' %} -{% load static humanize %} - -{% block content %} -
- {% include 'scanpipe/includes/navbar_header.html' %} -
{% include 'scanpipe/includes/messages.html' %}
- -
-
-

License Detection Summary

- New Scan -
-
- -
- {% include 'scantext/includes/license_summary_level.html' with detected_licenses=detected_licenses %} -
- -
-
-

Input License Text

-
{% for line in text %}{{ line }}{% endfor %}
-
-
-

Detected Licenses

-
- {% include 'scantext/includes/license_detail_card.html' with detected_licenses=detected_licenses %} -
-
-
- -
-{% endblock %} - -{% block scripts %} - -{% endblock %} diff --git a/scantext/templates/scantext/license_summary.html b/scantext/templates/scantext/license_summary.html new file mode 100644 index 000000000..52622d53e --- /dev/null +++ b/scantext/templates/scantext/license_summary.html @@ -0,0 +1,76 @@ +{% extends 'scanpipe/base.html' %} +{% load static humanize %} + +{% block content %} +
+ {% include 'scanpipe/includes/navbar_header.html' %} +
{% include 'scanpipe/includes/messages.html' %}
+ +
+
+

License Detection Summary

+ New Scan +
+
+ + +
+ +
+ + {% include 'scantext/includes/license_summary_header.html' with detected_licenses=detected_licenses %} +
+ +
+
+

Input License Text

+
{% for line in text %}{{ line }}{% endfor %}
+
+
+

Matched Text

+
+ {% include 'scantext/includes/license_summary_cards.html' with detected_licenses=detected_licenses %} +
+
+
+ + {% include 'scantext/includes/license_detail.html' with detected_licenses=detected_licenses %} +
+{% endblock %} + +{% block scripts %} + + +{% endblock %} From 2d4f7f6f5fa25ef980374a611fce577516734b4d Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Fri, 1 Jul 2022 16:27:20 +0530 Subject: [PATCH 18/59] Fixed broken short lines #450 Signed-off-by: Akhil Raj --- scantext/views.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scantext/views.py b/scantext/views.py index a59cd66b7..d9596bd9e 100644 --- a/scantext/views.py +++ b/scantext/views.py @@ -63,18 +63,14 @@ def license_scanview(request): with tempfile.NamedTemporaryFile(mode="w") as temp_file: temp_file.write(input_text) temp_file.flush() - expressions = get_licenses( - location=temp_file.name, - ) + expressions = get_licenses(location=temp_file.name) elif input_file: try: with tempfile.NamedTemporaryFile(mode="w") as temp_file: input_text = str(input_file.read(), "UTF-8") temp_file.write(input_text) temp_file.flush() - expressions = get_licenses( - location=temp_file.name, - ) + expressions = get_licenses(location=temp_file.name) except UnicodeDecodeError: message = "Please upload a valid text file." 
messages.warning(request, message) From bbe58d002f2001a8ecbac54b3236a0ebb2d098b6 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Fri, 1 Jul 2022 16:33:13 +0530 Subject: [PATCH 19/59] New line at the end of the page #450 Signed-off-by: Akhil Raj --- scantext/templates/scantext/includes/license_detail.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scantext/templates/scantext/includes/license_detail.html b/scantext/templates/scantext/includes/license_detail.html index 9b6935c2b..3e9280b55 100644 --- a/scantext/templates/scantext/includes/license_detail.html +++ b/scantext/templates/scantext/includes/license_detail.html @@ -45,4 +45,4 @@ } -{% endfor %} \ No newline at end of file +{% endfor %} From a19a7621301ac1b2cbab5921aa3e33e44ece2dce Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Mon, 4 Jul 2022 17:46:37 +0530 Subject: [PATCH 20/59] Ace Editor Restored with improved UI #450 Signed-off-by: Akhil Raj --- .../includes/license_summary_cards.html | 47 +++++++++++----- .../templates/scantext/license_summary.html | 53 ++++++++++++++----- scantext/views.py | 5 +- 3 files changed, 76 insertions(+), 29 deletions(-) diff --git a/scantext/templates/scantext/includes/license_summary_cards.html b/scantext/templates/scantext/includes/license_summary_cards.html index f9df2aefe..4906a2bdf 100644 --- a/scantext/templates/scantext/includes/license_summary_cards.html +++ b/scantext/templates/scantext/includes/license_summary_cards.html @@ -1,6 +1,6 @@ {% for license in detected_licenses.licenses %}
-
+
{% if license.homepage_url %} {{ license.short_name }} {% else %} {{ license.short_name }} {% endif %} @@ -11,30 +11,49 @@

{{ license.score }}

-

- - - -

+

View

diff --git a/scantext/templates/scantext/license_summary.html b/scantext/templates/scantext/license_summary.html index 52622d53e..b19a71868 100644 --- a/scantext/templates/scantext/license_summary.html +++ b/scantext/templates/scantext/license_summary.html @@ -25,12 +25,15 @@

License Detection Summary


-
-

Input License Text

-
{% for line in text %}{{ line }}{% endfor %}
+
+
+

Input License Text

+ +
+
{{ text }}
-
-

Matched Text

+
+

Detected Licenses

{% include 'scantext/includes/license_summary_cards.html' with detected_licenses=detected_licenses %}
@@ -42,12 +45,27 @@

License Detection Summary

{% endblock %} {% block scripts %} - + {% endblock %} From 03339322b184b0df929b335e7b0240d9c98cde59 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Tue, 5 Jul 2022 19:05:38 +0530 Subject: [PATCH 22/59] Close all cards while opening a new one #450 Signed-off-by: Akhil Raj --- scantext/templates/scantext/license_summary.html | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scantext/templates/scantext/license_summary.html b/scantext/templates/scantext/license_summary.html index 272d2df10..d5061ed03 100644 --- a/scantext/templates/scantext/license_summary.html +++ b/scantext/templates/scantext/license_summary.html @@ -26,9 +26,7 @@

License Detection Summary

-
-

Input License Text

-
+

Input License Text

{{ text }}
@@ -78,6 +76,10 @@

License Detection Summary

content = card.parentElement.parentElement.parentElement.querySelector('.card-content').classList lineTag= card.parentElement.querySelector('.lines').classList if (content.contains('is-hidden')) { + cards.forEach(eachcard => { + eachcard.parentElement.querySelector('.lines').classList.remove('is-hidden') + eachcard.parentElement.parentElement.parentElement.querySelector('.card-content').classList.add('is-hidden') + }) content.remove('is-hidden') lineTag.add('is-hidden') } else { From b2061458bdfcd198ca47e06811db37ca41dea390 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Thu, 7 Jul 2022 17:50:46 +0530 Subject: [PATCH 23/59] Removed details option in the view #450 Signed-off-by: Akhil Raj --- .../scantext/includes/license_detail.html | 48 ------------------- .../templates/scantext/license_summary.html | 23 +-------- 2 files changed, 2 insertions(+), 69 deletions(-) delete mode 100644 scantext/templates/scantext/includes/license_detail.html diff --git a/scantext/templates/scantext/includes/license_detail.html b/scantext/templates/scantext/includes/license_detail.html deleted file mode 100644 index 3e9280b55..000000000 --- a/scantext/templates/scantext/includes/license_detail.html +++ /dev/null @@ -1,48 +0,0 @@ - -
-
-

Detected Licenses

-
-
-

Matched Text

-
-
- -{% for license in detected_licenses.licenses %} -
-
-
- -
-

{{ license.score }}

-

{{ license.category }}

- ref -

- {% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %} -

-
-
-
-
-
- -
- {{ license.matched_text }} -
-
-
-
-
- - -{% endfor %} diff --git a/scantext/templates/scantext/license_summary.html b/scantext/templates/scantext/license_summary.html index d5061ed03..9ce777d53 100644 --- a/scantext/templates/scantext/license_summary.html +++ b/scantext/templates/scantext/license_summary.html @@ -13,18 +13,11 @@

License Detection Summary

- -
- -
- +
{% include 'scantext/includes/license_summary_header.html' with detected_licenses=detected_licenses %}
-
+

Input License Text

{{ text }}
@@ -37,7 +30,6 @@

License Detection Summary

- {% include 'scantext/includes/license_detail.html' with detected_licenses=detected_licenses %}
{% endblock %} @@ -59,17 +51,6 @@

License Detection Summary

const cards = document.querySelectorAll('.card-header-icon') - document.querySelectorAll('li').forEach(li => { - li.addEventListener('click', (event) => { - if (!li.classList.contains('is-active')) { - document.querySelectorAll('li').forEach(listItem => { - listItem.classList.remove('is-active') - }) - li.classList.add('is-active') - } - }) - }) - cards.forEach(card => { card.addEventListener('click', (event) => { event.preventDefault() From b53e45cd3b8a5415eed95859cadb2b654b6c6ad4 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Sat, 16 Jul 2022 16:27:49 +0530 Subject: [PATCH 24/59] Restore details tab with matched text #450 Signed-off-by: Akhil Raj --- .../includes/license_summary_cards.html | 127 ++++++++++-------- .../includes/license_summary_detail.html | 48 +++++++ .../templates/scantext/license_summary.html | 51 ++++--- 3 files changed, 150 insertions(+), 76 deletions(-) create mode 100644 scantext/templates/scantext/includes/license_summary_detail.html diff --git a/scantext/templates/scantext/includes/license_summary_cards.html b/scantext/templates/scantext/includes/license_summary_cards.html index 421a0614f..cbeb49b84 100644 --- a/scantext/templates/scantext/includes/license_summary_cards.html +++ b/scantext/templates/scantext/includes/license_summary_cards.html @@ -1,67 +1,76 @@ -{% for license in detected_licenses.licenses %} -
-
-
- {% if license.homepage_url %} - {{ license.short_name }} {% else %} {{ license.short_name }} {% endif %} -
-
-

- {% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %} -

-

+

+

Input License Text

+
{{ text }}
+
+
+

Detected Licenses

+
+ {% for license in detected_licenses.licenses %} +
+
+
+ {% if license.homepage_url %} + {{ license.short_name }} {% else %} {{ license.short_name }} {% endif %} +
+
+

+ {% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %} +

+

{{ license.score }}

-

- +

+ -

-
-
- -
-{% endfor %} +
\ No newline at end of file diff --git a/scantext/templates/scantext/includes/license_summary_detail.html b/scantext/templates/scantext/includes/license_summary_detail.html new file mode 100644 index 000000000..57cddcfd9 --- /dev/null +++ b/scantext/templates/scantext/includes/license_summary_detail.html @@ -0,0 +1,48 @@ + +
+
+

Detected Licenses

+
+
+

Matched Text

+
+
+ +{% for license in detected_licenses.licenses %} +
+
+
+ +
+

{{ license.score }}

+

{{ license.category }}

+ ref +

+ {% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %} +

+
+
+
+
+
+ +
+ {{ license.matched_text }} +
+
+
+
+
+ + +{% endfor %} \ No newline at end of file diff --git a/scantext/templates/scantext/license_summary.html b/scantext/templates/scantext/license_summary.html index 9ce777d53..c5dfc5807 100644 --- a/scantext/templates/scantext/license_summary.html +++ b/scantext/templates/scantext/license_summary.html @@ -13,23 +13,24 @@

License Detection Summary

-
+ +
+ +
+ {% include 'scantext/includes/license_summary_header.html' with detected_licenses=detected_licenses %}
-
-
-

Input License Text

-
{{ text }}
-
-
-

Detected Licenses

-
- {% include 'scantext/includes/license_summary_cards.html' with detected_licenses=detected_licenses %} -
-
-
+
+ {% include 'scantext/includes/license_summary_cards.html' with detected_licenses=detected_licenses %} +
+
{% endblock %} @@ -39,7 +40,7 @@

License Detection Summary

let editor = ace.edit("editor", { mode: "ace/mode/text", autoScrollEditorIntoView: true, - wrap: false, + wrap: true, readOnly: true, showPrintMargin: false, highlightActiveLine: false, @@ -50,6 +51,23 @@

License Detection Summary

}); const cards = document.querySelectorAll('.card-header-icon') + const tabContent = document.querySelectorAll('.tab-content') + + document.querySelectorAll('li').forEach((li, index) => { + li.addEventListener('click', (event) => { + if (!li.classList.contains('is-active')) { + document.querySelectorAll('li').forEach(listItem => { + listItem.classList.remove('is-active') + }) + li.classList.add('is-active') + } + + tabContent.forEach(section => { + section.classList.add('is-hidden') + }) + tabContent[index].classList.remove('is-hidden') + }) + }) cards.forEach(card => { card.addEventListener('click', (event) => { @@ -90,7 +108,7 @@

License Detection Summary

let end_column = 10000; let range = new Range(start_row, start_column, end_row, end_column); - editor.session.addMarker(range, "ace-marker", "fullLine"); + editor.session.addMarker(range, "ace-marker", "line"); annotations.push({ row: start_row, column: 0, @@ -109,6 +127,5 @@

License Detection Summary

session.removeMarker(value.id); } } - -{% endblock %} +{% endblock %} \ No newline at end of file From 129306658a22e45cbacfeb9a2d6903111a2f1b8f Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Thu, 28 Jul 2022 20:09:39 +0530 Subject: [PATCH 25/59] Highlight text with colors #450 * philippes code is working only for one match Signed-off-by: Akhil Raj --- .../includes/license_summary_cards.html | 6 +- .../includes/license_summary_detail.html | 2 +- .../templates/scantext/license_summary.html | 18 +- scantext/views.py | 217 +++++++++++++----- 4 files changed, 179 insertions(+), 64 deletions(-) diff --git a/scantext/templates/scantext/includes/license_summary_cards.html b/scantext/templates/scantext/includes/license_summary_cards.html index cbeb49b84..88f63587b 100644 --- a/scantext/templates/scantext/includes/license_summary_cards.html +++ b/scantext/templates/scantext/includes/license_summary_cards.html @@ -52,9 +52,7 @@ Category - -

{{ license.category }}

- + {{ license.category }} SPDX Key @@ -73,4 +71,4 @@
-
\ No newline at end of file +
diff --git a/scantext/templates/scantext/includes/license_summary_detail.html b/scantext/templates/scantext/includes/license_summary_detail.html index 57cddcfd9..f453940bd 100644 --- a/scantext/templates/scantext/includes/license_summary_detail.html +++ b/scantext/templates/scantext/includes/license_summary_detail.html @@ -29,7 +29,7 @@
- {{ license.matched_text }} + {{ license.matched_text|safe }}
diff --git a/scantext/templates/scantext/license_summary.html b/scantext/templates/scantext/license_summary.html index c5dfc5807..176212fad 100644 --- a/scantext/templates/scantext/license_summary.html +++ b/scantext/templates/scantext/license_summary.html @@ -18,6 +18,8 @@

License Detection Summary

@@ -31,6 +33,20 @@

License Detection Summary

+ + + +
{% endblock %} @@ -128,4 +144,4 @@

License Detection Summary

} } -{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/scantext/views.py b/scantext/views.py index 026d43b8f..2c2df9a09 100644 --- a/scantext/views.py +++ b/scantext/views.py @@ -21,14 +21,18 @@ # Visit https://github.com/nexB/scancode.io for support and download. import sys +import attr import tempfile from django.conf import settings from django.contrib import messages from django.shortcuts import render +from licensedcode.stopwords import STOPWORDS +from licensedcode.match import tokenize_matched_text from scantext.forms import LicenseScanForm +TRACE_HIGHLIGHTED_TEXT = True SCANCODE_REPO_URL = "https://github.com/nexB/scancode-toolkit" SCANCODE_BASE_URL = SCANCODE_REPO_URL + "/tree/develop/src/licensedcode/data/licenses" SCANCODE_LICENSE_TEXT_URL = SCANCODE_BASE_URL + "/{}.LICENSE" @@ -39,62 +43,71 @@ def license_scanview(request): - form = LicenseScanForm() - if request.method == "POST": - form = LicenseScanForm(request.POST, request.FILES) - if form.is_valid(): - input_text = form.cleaned_data["input_text"] - input_file = request.FILES.get("input_file", False) - if not len(input_text) and not input_file: - message = "Please provide some text or a text file to scan." - messages.warning(request, message) - return render( - request, - "scantext/license_scan_form.html", - { - "form": LicenseScanForm(), - }, - ) - - # The flush in tempfile is required to ensure that the content is - # written to the disk before it's read by get_licenses function - if len(input_text): - with tempfile.NamedTemporaryFile(mode="w") as temp_file: - temp_file.write(input_text) - temp_file.flush() - expressions = get_licenses(location=temp_file.name) - elif input_file: - try: - with tempfile.NamedTemporaryFile(mode="w") as temp_file: - input_text = str(input_file.read(), "UTF-8") - temp_file.write(input_text) - temp_file.flush() - expressions = get_licenses(location=temp_file.name) - except UnicodeDecodeError: - message = "Please upload a valid text file." 
- messages.warning(request, message) - return render( - request, - "scantext/license_scan_form.html", - { - "form": LicenseScanForm(), - }, - ) - - if not len(expressions["licenses"]): - if not len(expressions["license_expressions"]): - message = "Couldn't detect any license from the provided input." - messages.info(request, message) - return render( - request, - "scantext/license_summary.html", - { - "text": input_text, - "detected_licenses": expressions, - }, - ) + if request.method != "POST": + return render( + request, "scantext/license_scan_form.html", {"form": LicenseScanForm()} + ) + + form = LicenseScanForm(request.POST, request.FILES) + if not form.is_valid(): + return render( + request, "scantext/license_scan_form.html", {"form": LicenseScanForm()} + ) + + input_text = form.cleaned_data["input_text"] + input_file = request.FILES.get("input_file", False) + + if input_text and input_file: + message = "Provide text or a text file but not both." + messages.warning(request, message) + return render( + request, + "scantext/license_scan_form.html", + { + "form": LicenseScanForm(), + }, + ) + + if not input_text and not input_file: + message = "Provide text or a text file to scan." + messages.warning(request, message) + return render( + request, + "scantext/license_scan_form.html", + { + "form": LicenseScanForm(), + }, + ) + # The flush in tempfile is required to ensure that the content is + # written to the disk before it's read by get_licenses function + if input_text: + with tempfile.NamedTemporaryFile(mode="w") as temp_file: + temp_file.write(input_text) + temp_file.flush() + expressions = get_licenses(location=temp_file.name) + elif input_file: + try: + with tempfile.NamedTemporaryFile(mode="w") as temp_file: + input_text = str(input_file.read(), "UTF-8") + temp_file.write(input_text) + temp_file.flush() + expressions = get_licenses(location=temp_file.name) + except UnicodeDecodeError: + message = "Please upload a valid text file." 
+ messages.warning(request, message) return render( + request, + "scantext/license_scan_form.html", + { + "form": LicenseScanForm(), + }, + ) + + if not expressions["licenses"] and not expressions["license_expressions"]: + message = "Couldn't detect any license from the provided input." + messages.info(request, message) + return render( request, "scantext/license_summary.html", { @@ -102,7 +115,18 @@ def license_scanview(request): "detected_licenses": expressions, }, ) - return render(request, "scantext/license_scan_form.html", {"form": form}) + # if TRACE_HIGHLIGHTED_TEXT: + # from pprint import pprint + # pprint(expressions, indent=4) + + return render( + request, + "scantext/license_summary.html", + { + "text": input_text, + "detected_licenses": expressions, + }, + ) def get_licenses( @@ -123,10 +147,10 @@ def get_licenses( score lower than `minimum_score` are not returned. By Default ``unknown_licenses`` is set to True to detect unknown licenses. """ - from licensedcode import cache + from licensedcode.cache import get_index from licensedcode.spans import Span - idx = cache.get_index() + idx = get_index() detected_licenses = [] detected_expressions = [] @@ -141,6 +165,8 @@ def get_licenses( qspans = [] match = None + complete_text = '' + complete_text_in_array = [] for match in matches: qspans.append(match.qspan) @@ -153,6 +179,18 @@ def get_licenses( ) ) + complete_text += get_highlighted_lines( + match=match, + stopwords=STOPWORDS, + trace=TRACE_HIGHLIGHTED_TEXT, + ) + + complete_text_in_array.append(get_highlighted_lines( + match=match, + stopwords=STOPWORDS, + trace=TRACE_HIGHLIGHTED_TEXT, + )) + percentage_of_license_text = 0 if match: # we need at least one match to compute a license_coverage @@ -167,6 +205,8 @@ def get_licenses( ("licenses", detected_licenses), ("license_expressions", detected_expressions), ("percentage_of_license_text", percentage_of_license_text), + ("complete_text_in_array", complete_text_in_array), + ("complete_text", complete_text) 
] ) @@ -184,7 +224,9 @@ def _licenses_data_from_match( licenses = cache.get_licenses_db() # Returned matched_text will also include the text detected - matched_text = match.matched_text(whole_lines=False, highlight=True) + matched_text = match.matched_text(whole_lines=False, highlight=True, + highlight_matched='{}', + highlight_not_matched='{}',) detected_licenses = [] for license_key in match.rule.license_keys(): @@ -239,3 +281,62 @@ def _licenses_data_from_match( matched_rule["rule_relevance"] = match.rule.relevance return detected_licenses + +def logger_debug(*args): pass + +def get_highlighted_lines( + match, + stopwords=STOPWORDS, + trace=TRACE_HIGHLIGHTED_TEXT, +): + """ + Yield highlighted text lines (with line returns) for the whole of the matched and unmatched text of a ``query``. + """ + query = match.query + tokens = tokenize_matched_text( + location=query.location, + query_string=query.query_string, + dictionary=query.idx.dictionary, + start_line=match.query.start_line, + _cache={}, + ) + tokens = tag_matched_tokens(tokens=tokens, match_qspan=match.qspan) + + header = ''' +
''' + footer = '''
''' + + body = '' + highlight_matched = '{}' + highlight_not_matched = '{}' + for token in tokens: + val = token.value + if token.is_text and val.lower() not in stopwords: + if token.is_matched: + body += highlight_matched.format(val) + else: + body += highlight_not_matched.format(val) + else: + # we do not highlight punctuation and stopwords. + body += highlight_not_matched.format(val) + + return header + body + footer + + +def tag_matched_tokens(tokens, match_qspan): + """ + Yield Tokens from a ``tokens`` iterable of Token objects. + Known matched tokens are tagged as "is_matched=True" if they are matched. + """ + for tok in tokens: + # tagged known matched tokens (useful for highlighting) + if tok.pos != -1 and tok.is_known and tok.pos in match_qspan: + tok = attr.evolve(tok, is_matched=True) + yield tok + + From a6b34de9304930c072bf475b61c7f218f1421b55 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Thu, 28 Jul 2022 21:02:45 +0530 Subject: [PATCH 26/59] Fix failing text for `make valid` #450 Signed-off-by: Akhil Raj --- scantext/views.py | 61 ++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/scantext/views.py b/scantext/views.py index 2c2df9a09..443fcebd3 100644 --- a/scantext/views.py +++ b/scantext/views.py @@ -21,15 +21,16 @@ # Visit https://github.com/nexB/scancode.io for support and download. import sys -import attr import tempfile from django.conf import settings from django.contrib import messages from django.shortcuts import render -from licensedcode.stopwords import STOPWORDS +import attr from licensedcode.match import tokenize_matched_text +from licensedcode.stopwords import STOPWORDS + from scantext.forms import LicenseScanForm TRACE_HIGHLIGHTED_TEXT = True @@ -108,13 +109,13 @@ def license_scanview(request): message = "Couldn't detect any license from the provided input." 
messages.info(request, message) return render( - request, - "scantext/license_summary.html", - { - "text": input_text, - "detected_licenses": expressions, - }, - ) + request, + "scantext/license_summary.html", + { + "text": input_text, + "detected_licenses": expressions, + }, + ) # if TRACE_HIGHLIGHTED_TEXT: # from pprint import pprint # pprint(expressions, indent=4) @@ -165,7 +166,7 @@ def get_licenses( qspans = [] match = None - complete_text = '' + complete_text = "" complete_text_in_array = [] for match in matches: qspans.append(match.qspan) @@ -180,16 +181,18 @@ def get_licenses( ) complete_text += get_highlighted_lines( - match=match, - stopwords=STOPWORDS, - trace=TRACE_HIGHLIGHTED_TEXT, - ) + match=match, + stopwords=STOPWORDS, + trace=TRACE_HIGHLIGHTED_TEXT, + ) - complete_text_in_array.append(get_highlighted_lines( + complete_text_in_array.append( + get_highlighted_lines( match=match, stopwords=STOPWORDS, trace=TRACE_HIGHLIGHTED_TEXT, - )) + ) + ) percentage_of_license_text = 0 if match: @@ -206,7 +209,7 @@ def get_licenses( ("license_expressions", detected_expressions), ("percentage_of_license_text", percentage_of_license_text), ("complete_text_in_array", complete_text_in_array), - ("complete_text", complete_text) + ("complete_text", complete_text), ] ) @@ -224,9 +227,12 @@ def _licenses_data_from_match( licenses = cache.get_licenses_db() # Returned matched_text will also include the text detected - matched_text = match.matched_text(whole_lines=False, highlight=True, - highlight_matched='{}', - highlight_not_matched='{}',) + matched_text = match.matched_text( + whole_lines=False, + highlight=True, + highlight_matched="{}", + highlight_not_matched="{}", + ) detected_licenses = [] for license_key in match.rule.license_keys(): @@ -282,7 +288,10 @@ def _licenses_data_from_match( return detected_licenses -def logger_debug(*args): pass + +def logger_debug(*args): + pass + def get_highlighted_lines( match, @@ -302,16 +311,16 @@ def get_highlighted_lines( ) 
tokens = tag_matched_tokens(tokens=tokens, match_qspan=match.qspan) - header = ''' -
''' - footer = '''
''' +
""" + footer = """
""" - body = '' + body = "" highlight_matched = '{}' highlight_not_matched = '{}' for token in tokens: @@ -338,5 +347,3 @@ def tag_matched_tokens(tokens, match_qspan): if tok.pos != -1 and tok.is_known and tok.pos in match_qspan: tok = attr.evolve(tok, is_matched=True) yield tok - - From 1bdf9d19094f27887110cd1aedb5bb341e96b6ac Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Thu, 28 Jul 2022 21:14:14 +0530 Subject: [PATCH 27/59] Fix failing test for `make valid` #450 Signed-off-by: Akhil Raj --- scantext/views.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/scantext/views.py b/scantext/views.py index 443fcebd3..2e1ffe744 100644 --- a/scantext/views.py +++ b/scantext/views.py @@ -299,7 +299,8 @@ def get_highlighted_lines( trace=TRACE_HIGHLIGHTED_TEXT, ): """ - Yield highlighted text lines (with line returns) for the whole of the matched and unmatched text of a ``query``. + Yield highlighted text lines (with line returns) for the whole + of the matched and unmatched text of a ``query``. """ query = match.query tokens = tokenize_matched_text( @@ -312,12 +313,26 @@ def get_highlighted_lines( tokens = tag_matched_tokens(tokens=tokens, match_qspan=match.qspan) header = """ -
""" +
+ """ footer = """
""" body = "" From 67462c5a957ff87813268dcbf109ccd2edd99c96 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Sun, 7 Aug 2022 12:19:19 +0530 Subject: [PATCH 28/59] Testing out new UI to match the projects page #450 Signed-off-by: Akhil Raj --- .../includes/license_summary_detail.html | 87 ++++++++++--------- .../templates/scantext/license_summary.html | 30 ++----- scantext/views.py | 8 -- 3 files changed, 52 insertions(+), 73 deletions(-) diff --git a/scantext/templates/scantext/includes/license_summary_detail.html b/scantext/templates/scantext/includes/license_summary_detail.html index f453940bd..25990c3a5 100644 --- a/scantext/templates/scantext/includes/license_summary_detail.html +++ b/scantext/templates/scantext/includes/license_summary_detail.html @@ -1,48 +1,51 @@ - -
-
-

Detected Licenses

-
-
-

Matched Text

-
-
- -{% for license in detected_licenses.licenses %}
-
-
- -
-

{{ license.score }}

-

{{ license.category }}

- ref -

- {% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %} -

-
-
-
-
- -
- {{ license.matched_text|safe }} +

Input License Text

+ {% for text in detected_licenses.complete_text_in_array %} +
+ {{ text|safe }} +
+ {% endfor %} +
+
+
+
+

Detected Licenses

+
+ + +
+ {% for license in detected_licenses.licenses %} + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Name{{ license.name }}
Score{{ license.score }}
Owner{{ license.owner }}
Category{{ license.category }}
SPDX Key{{ license.spdx_license_key }}
Reference{{ license.reference_url }}
+ {% endfor %}
-
- - -{% endfor %} \ No newline at end of file diff --git a/scantext/templates/scantext/license_summary.html b/scantext/templates/scantext/license_summary.html index 176212fad..f95a7a96f 100644 --- a/scantext/templates/scantext/license_summary.html +++ b/scantext/templates/scantext/license_summary.html @@ -15,38 +15,22 @@

License Detection Summary

{% include 'scantext/includes/license_summary_header.html' with detected_licenses=detected_licenses %}
-
+ -
{% endblock %} @@ -67,9 +51,9 @@

License Detection Summary

}); const cards = document.querySelectorAll('.card-header-icon') - const tabContent = document.querySelectorAll('.tab-content') + const tabContent = document.querySelectorAll('.tab-container') - document.querySelectorAll('li').forEach((li, index) => { + document.querySelectorAll('.nav li').forEach((li, index) => { li.addEventListener('click', (event) => { if (!li.classList.contains('is-active')) { document.querySelectorAll('li').forEach(listItem => { diff --git a/scantext/views.py b/scantext/views.py index 2e1ffe744..5dcb703d2 100644 --- a/scantext/views.py +++ b/scantext/views.py @@ -166,7 +166,6 @@ def get_licenses( qspans = [] match = None - complete_text = "" complete_text_in_array = [] for match in matches: qspans.append(match.qspan) @@ -180,12 +179,6 @@ def get_licenses( ) ) - complete_text += get_highlighted_lines( - match=match, - stopwords=STOPWORDS, - trace=TRACE_HIGHLIGHTED_TEXT, - ) - complete_text_in_array.append( get_highlighted_lines( match=match, @@ -209,7 +202,6 @@ def get_licenses( ("license_expressions", detected_expressions), ("percentage_of_license_text", percentage_of_license_text), ("complete_text_in_array", complete_text_in_array), - ("complete_text", complete_text), ] ) From 0f675ffaeef4d43a8bd59185f3c9de891943d06a Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Tue, 9 Aug 2022 15:25:15 +0530 Subject: [PATCH 29/59] Add match_text tests to scantext #450 Signed-off-by: Akhil Raj --- scantext/match_text.py | 501 ++++++ .../tests/data/matched_text/binary_text/gosu | Bin 0 -> 1712 bytes .../binary_text/rules/gpl-3.0_rdesc_1.RULE | 1 + .../binary_text/rules/gpl-3.0_rdesc_1.yml | 2 + .../tests/data/matched_text/ffmpeg/ffmpeg | Bin 0 -> 6136 bytes .../tests/data/matched_text/ffmpeg/ffmpeg.exe | Bin 0 -> 16136 bytes .../data/matched_text/ffmpeg/libavsample.lib | Bin 0 -> 1783 bytes .../index/rules/gpl-2.0_bare_single_word.RULE | 1 + .../index/rules/gpl-2.0_bare_single_word.yml | 3 + .../index/rules/gpl-2.0_or_apache-2.0_2.RULE | 3 + 
.../index/rules/gpl-2.0_or_apache-2.0_2.yml | 6 + .../matched_text/index/rules/mit_101.RULE | 2 + .../data/matched_text/index/rules/mit_101.yml | 5 + scantext/tests/data/matched_text/query.txt | 4 + .../tests/data/matched_text/spdx/query.txt | 12 + .../tokenize_matched_text_query.txt | 1 + .../data/matched_text/turkish_unicode/query | 20 + .../turkish_unicode/rules/rule1.RULE | 1 + .../turkish_unicode/rules/rule1.yml | 1 + .../turkish_unicode/rules/rule2.RULE | 2 + .../turkish_unicode/rules/rule2.yml | 1 + .../turkish_unicode/rules/rule3.RULE | 1 + .../turkish_unicode/rules/rule3.yml | 1 + .../turkish_unicode/rules/rule4.RULE | 1 + .../turkish_unicode/rules/rule4.yml | 1 + .../data/matched_text/unicode_text/main3.js | 1 + scantext/tests/test.py | 25 - scantext/tests/test_match_text.py | 1495 +++++++++++++++++ 28 files changed, 2066 insertions(+), 25 deletions(-) create mode 100644 scantext/match_text.py create mode 100644 scantext/tests/data/matched_text/binary_text/gosu create mode 100644 scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.RULE create mode 100644 scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.yml create mode 100644 scantext/tests/data/matched_text/ffmpeg/ffmpeg create mode 100644 scantext/tests/data/matched_text/ffmpeg/ffmpeg.exe create mode 100644 scantext/tests/data/matched_text/ffmpeg/libavsample.lib create mode 100644 scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.RULE create mode 100644 scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.yml create mode 100644 scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.RULE create mode 100644 scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.yml create mode 100644 scantext/tests/data/matched_text/index/rules/mit_101.RULE create mode 100644 scantext/tests/data/matched_text/index/rules/mit_101.yml create mode 100644 scantext/tests/data/matched_text/query.txt create mode 100644 
scantext/tests/data/matched_text/spdx/query.txt create mode 100644 scantext/tests/data/matched_text/tokenize_matched_text_query.txt create mode 100644 scantext/tests/data/matched_text/turkish_unicode/query create mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule1.RULE create mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule1.yml create mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule2.RULE create mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule2.yml create mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule3.RULE create mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule3.yml create mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule4.RULE create mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule4.yml create mode 100644 scantext/tests/data/matched_text/unicode_text/main3.js delete mode 100644 scantext/tests/test.py create mode 100644 scantext/tests/test_match_text.py diff --git a/scantext/match_text.py b/scantext/match_text.py new file mode 100644 index 000000000..e43a60ca2 --- /dev/null +++ b/scantext/match_text.py @@ -0,0 +1,501 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# ScanCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/scancode-toolkit for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +from enum import IntEnum +from itertools import groupby + +import attr +from attr import validators +from licensedcode import query +from licensedcode.spans import Span +from licensedcode.stopwords import STOPWORDS +from licensedcode.tokenize import index_tokenizer +from licensedcode.tokenize import matched_query_text_tokenizer + +TRACE = False +TRACE_MATCHED_TEXT = False +TRACE_MATCHED_TEXT_DETAILS = False + + +def logger_debug(*args): + pass + + +if TRACE or TRACE_MATCHED_TEXT or TRACE_MATCHED_TEXT_DETAILS: + + use_print = True + if use_print: + prn = print + else: + import logging + import sys + + logger = logging.getLogger(__name__) + # logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) + logging.basicConfig(stream=sys.stdout) + logger.setLevel(logging.DEBUG) + prn = logger.debug + + def logger_debug(*args): + return prn(" ".join(isinstance(a, str) and a or repr(a) for a in args)) + + def _debug_print_matched_query_text(match, extras=5): + """ + Print a matched query text including `extras` tokens before and after + the match. Used for debugging license matches. + """ + # Create a fake new match with extra tokens before and after + new_match = match.combine(match) + new_qstart = max([0, match.qstart - extras]) + new_qend = min([match.qend + extras, len(match.query.tokens)]) + new_qspan = Span(new_qstart, new_qend) + new_match.qspan = new_qspan + + logger_debug(new_match) + logger_debug(" MATCHED QUERY TEXT with extras") + qt = new_match.matched_text(whole_lines=False) + logger_debug(qt) + + +@attr.s(slots=True, frozen=True) +class Token(object): + """ + Used to represent a token in collected query-side matched texts and SPDX + identifiers. + """ + + # original text value for this token. + value = attr.ib() + # line number, one-based + line_num = attr.ib() + # absolute position for known tokens, zero-based. 
-1 for unknown tokens + pos = attr.ib(default=-1) + # True if text/alpha False if this is punctuation or spaces + is_text = attr.ib(default=False) + # True if part of a match + is_matched = attr.ib(default=False) + # True if this is a known token + is_known = attr.ib(default=False) + + +def tokenize_matched_text( + location, + query_string, + dictionary, + start_line=1, + _cache={}, +): + """ + Return a list of Token objects with pos and line number collected from the + file at `location` or the `query_string` string. `dictionary` is the index + mapping a token string to a token id. + + NOTE: the _cache={} arg IS A GLOBAL mutable by design. + """ + key = location, query_string, start_line + cached = _cache.get(key) + if cached: + return cached + # we only cache the last call + _cache.clear() + _cache[key] = result = list( + _tokenize_matched_text( + location=location, + query_string=query_string, + dictionary=dictionary, + start_line=start_line, + ) + ) + return result + + +def _tokenize_matched_text( + location, + query_string, + dictionary, + start_line=1, + trace=TRACE_MATCHED_TEXT_DETAILS, +): + """ + Yield Token objects with pos and line number collected from the file at + `location` or the `query_string` string. `dictionary` is the index mapping + of tokens to token ids. + """ + pos = 0 + qls = query.query_lines( + location=location, + query_string=query_string, + strip=False, + start_line=start_line, + ) + for line_num, line in qls: + if trace: + logger_debug( + " _tokenize_matched_text:", "line_num:", line_num, "line:", line + ) + + for is_text, token_str in matched_query_text_tokenizer(line): + if trace: + logger_debug(" is_text:", is_text, "token_str:", repr(token_str)) + + # Determine if a token is is_known in the license index or not. This + # is essential as we need to realign the query-time tokenization + # with the full text to report proper matches. 
+ if is_text and token_str and token_str.strip(): + + # we retokenize using the query tokenizer: + # 1. to lookup for is_known tokens in the index dictionary + + # 2. to ensure the number of tokens is the same in both + # tokenizers (though, of course, the case will differ as the + # regular query tokenizer ignores case and punctuations). + + # NOTE: we have a rare Unicode bug/issue because of some Unicode + # codepoint such as some Turkish characters that decompose to + # char + punct when casefolded. This should be fixed in Unicode + # release 14 and up and likely implemented in Python 3.10 and up + # See https://github.com/nexB/scancode-toolkit/issues/1872 + # See also: https://bugs.python.org/issue34723#msg359514 + qtokenized = list(index_tokenizer(token_str)) + if not qtokenized: + + yield Token( + value=token_str, + line_num=line_num, + is_text=is_text, + is_known=False, + pos=-1, + ) + + elif len(qtokenized) == 1: + is_known = qtokenized[0] in dictionary + if is_known: + p = pos + pos += 1 + else: + p = -1 + + yield Token( + value=token_str, + line_num=line_num, + is_text=is_text, + is_known=is_known, + pos=p, + ) + else: + # we have two or more tokens from the original query mapped + # to a single matched text tokenizer token. + for qtoken in qtokenized: + is_known = qtoken in dictionary + if is_known: + p = pos + pos += 1 + else: + p = -1 + + yield Token( + value=qtoken, + line_num=line_num, + is_text=is_text, + is_known=is_known, + pos=p, + ) + else: + + yield Token( + value=token_str, + line_num=line_num, + is_text=False, + is_known=False, + pos=-1, + ) + + +def reportable_tokens( + tokens, + match_qspan, + start_line, + end_line, + whole_lines=False, + trace=TRACE_MATCHED_TEXT_DETAILS, +): + """ + Yield Tokens from a ``tokens`` iterable of Token objects (built from a query- + side scanned file or string) that are inside a ``match_qspan`` matched Span + starting at `start_line` and ending at ``end_line``. 
If whole_lines is True, + also yield unmatched Tokens that are before and after the match and on the + first and last line of a match (unless the lines are very long text lines or + the match is from binary content.) + + As a side effect, known matched tokens are tagged as "is_matched=True" if + they are matched. + + If ``whole_lines`` is True, any token within matched lines range is + included. Otherwise, a token is included if its position is within the + matched ``match_qspan`` or it is a punctuation token immediately after the + matched ``match_qspan`` even though not matched. + """ + start = match_qspan.start + end = match_qspan.end + + started = False + finished = False + + end_pos = 0 + last_pos = 0 + for real_pos, tok in enumerate(tokens): + if trace: + logger_debug("reportable_tokens: processing", real_pos, tok) + + # ignore tokens outside the matched lines range + if tok.line_num < start_line: + if trace: + logger_debug( + " tok.line_num < start_line:", tok.line_num, "<", start_line + ) + + continue + + if tok.line_num > end_line: + if trace: + logger_debug(" tok.line_num > end_line", tok.line_num, ">", end_line) + + break + + if trace: + logger_debug("reportable_tokens:", real_pos, tok) + + is_included = False + + # tagged known matched tokens (useful for highlighting) + if tok.pos != -1 and tok.is_known and tok.pos in match_qspan: + tok = attr.evolve(tok, is_matched=True) + is_included = True + if trace: + logger_debug(" tok.is_matched = True", "match_qspan:", match_qspan) + else: + if trace: + logger_debug( + " unmatched token: tok.is_matched = False", + "match_qspan:", + match_qspan, + "tok.pos in match_qspan:", + tok.pos in match_qspan, + ) + + if whole_lines: + # we only work on matched lines so no need to test further + # if start_line <= tok.line_num <= end_line. + if trace: + logger_debug(" whole_lines") + + is_included = True + + else: + # Are we in the match_qspan range or a punctuation right before or after + # that range? 
+ + # start + if not started and tok.pos == start: + started = True + if trace: + logger_debug(" start") + + is_included = True + + # middle + if started and not finished: + if trace: + logger_debug(" middle") + + is_included = True + + if tok.pos == end: + if trace: + logger_debug(" at end") + + finished = True + started = False + end_pos = real_pos + + # one punctuation token after a match + if finished and not started and end_pos and last_pos == end_pos: + end_pos = 0 + if not tok.is_text: + # strip the trailing spaces of the last token + if tok.value.strip(): + if trace: + logger_debug(" end yield") + + is_included = True + + last_pos = real_pos + if is_included: + yield tok + + +def get_full_matched_text( + match, + location=None, + query_string=None, + idx=None, + whole_lines=False, + highlight=True, + highlight_matched="{}", + highlight_not_matched="[{}]", + only_matched=False, + stopwords=STOPWORDS, + _usecache=True, + trace=TRACE_MATCHED_TEXT, +): + """ + Yield strings corresponding to the full matched query text given a ``match`` + LicenseMatch detected with an `idx` LicenseIndex in a query file at + ``location`` or a ``query_string``. 
+ + See get_full_qspan_matched_text() for other arguments documentation + """ + if trace: + logger_debug("get_full_matched_text: match:", match) + + return get_full_qspan_matched_text( + match_qspan=match.qspan, + match_query_start_line=match.query.start_line, + match_start_line=match.start_line, + match_end_line=match.end_line, + location=location, + query_string=query_string, + idx=idx, + whole_lines=whole_lines, + highlight=highlight, + highlight_matched=highlight_matched, + highlight_not_matched=highlight_not_matched, + only_matched=only_matched, + stopwords=stopwords, + _usecache=_usecache, + trace=trace, + ) + + +def get_full_qspan_matched_text( + match_qspan, + match_query_start_line, + match_start_line, + match_end_line, + location=None, + query_string=None, + idx=None, + whole_lines=False, + highlight=True, + highlight_matched="{}", + highlight_not_matched="[{}]", + only_matched=False, + stopwords=STOPWORDS, + _usecache=True, + trace=TRACE_MATCHED_TEXT, +): + """ + Yield strings corresponding to words of the matched query text given a + ``match_qspan`` LicenseMatch qspan Span detected with an `idx` LicenseIndex + in a query file at ``location`` or a ``query_string``. + + - ``match_query_start_line`` is the match query.start_line + - ``match_start_line`` is the match start_line + - ``match_end_line`` is the match= end_line + + The returned strings contains the full text including punctuations and + spaces that are not participating in the match proper including punctuations. + + If ``whole_lines`` is True, the unmatched part at the start of the first + matched line and the unmatched part at the end of the last matched lines are + also included in the returned text (unless the line is very long). + + If ``highlight`` is True, each token is formatted for "highlighting" and + emphasis with the ``highlight_matched`` format string for matched tokens or to + the ``highlight_not_matched`` for tokens not matched. 
The default is to + enclose an unmatched token sequence in [] square brackets. Punctuation is + not highlighted. + + if ``only_matched`` is True, only matched tokens are returned and + ``whole_lines`` and ``highlight`` are ignored. Unmatched words are replaced + by a "dot". + + If ``_usecache`` is True, the tokenized text is cached for efficiency. + """ + if trace: + logger_debug("get_full_qspan_matched_text: match_qspan:", match_qspan) + logger_debug("get_full_qspan_matched_text: location:", location) + logger_debug("get_full_qspan_matched_text: query_string :", query_string) + + assert location or query_string + assert idx + + if only_matched: + # use highlighting to skip the reporting of unmatched entirely + whole_lines = False + highlight = True + highlight_matched = "{}" + highlight_not_matched = "." + highlight = True + + # Create and process a stream of Tokens + if not _usecache: + # for testing only, reset cache on each call + tokens = tokenize_matched_text( + location=location, + query_string=query_string, + dictionary=idx.dictionary, + start_line=match_query_start_line, + _cache={}, + ) + else: + tokens = tokenize_matched_text( + location=location, + query_string=query_string, + dictionary=idx.dictionary, + start_line=match_query_start_line, + ) + + if trace: + tokens = list(tokens) + print() + logger_debug("get_full_qspan_matched_text: tokens:") + for t in tokens: + print(" ", t) + print() + + tokens = reportable_tokens( + tokens=tokens, + match_qspan=match_qspan, + start_line=match_start_line, + end_line=match_end_line, + whole_lines=whole_lines, + ) + + if trace: + tokens = list(tokens) + logger_debug("get_full_qspan_matched_text: reportable_tokens:") + for t in tokens: + print(t) + print() + + # Finally yield strings with eventual highlightings + for token in tokens: + val = token.value + if not highlight: + yield val + else: + if token.is_text and val.lower() not in stopwords: + if token.is_matched: + yield highlight_matched.format(val) + else: + 
yield highlight_not_matched.format(val) + else: + # we do not highlight punctuation and stopwords. + yield val diff --git a/scantext/tests/data/matched_text/binary_text/gosu b/scantext/tests/data/matched_text/binary_text/gosu new file mode 100644 index 0000000000000000000000000000000000000000..61b925fe68c8f0f791a440e370e9b61dfc0684ae GIT binary patch literal 1712 zcmbVM%WD%s7@t108fhC8@j6gwRcJHsnJrRsX)Rc=BG!WlO_OaBOtYoC@j)rgv3S*E z4<35;;HmB1gU2GG$Mz!hk5F)CXJ*|5q2R#I?|c2e@0-WY?)=ii;J^UzV-U;%qn+%| zr>3%VeEAy!%0Ph;J&%DQa9R6@_2=WtPV%_tcpgE~CNv!L?T=f1f4rOu9&6SNJepWD zOu9>{%m!%K4%T?55s%X8ShcO%t3=6Pm~V0psEFIRi8yJ8o(7Ytdu zOaJI+QcHQ(df|Mn+5XxDbMHsICq>w!0npXD+rb`B`QTQnzwNQwT6xn`t>o*U9T*D$XO3UHo~qdd zGsy}S4SD|Jq2(_g+WH@iGdKG8{e|}B`R7NLe|}{7Ich5Qr30NlJ~*~~aBTVAch`DS zz#a{NDaX(DFuSJ{X=H!AJJu|VIUUoJ=R?Pb_TE#(5Jv(@q)>nn#+YD=8Rl4Ei4_hA zBA5_D2_u{cB8egaMHEv)DP@#XK_yi*V2EKx7-fucCYWT31srkA38$QK&IOlT@jxJf z1rby*!G#b~C=o~`u_Tg8Cb<++N+kn@6jnr0#S~XUNu^W}=oSNAKG3NHO$)TC@FcF( z!&$h!4QC#Qt#!Dw13PiJH63q;RalKSHY)8Jd{)`2$IlB8!m!>On=~u!s14UDal^RH z$fs&YYf{b*dX$+jM!l_q{sIt+7lasJnoB9_EBZOwL_7)2HWG}<5xEZx=11nb1 z{r1GPn$@r!>jm9kUYe%x_Ijt)f=T!~ft3U{l4LWUEtTrcq|sTM(W@)j{!8^J?%XK= zu=>^fumEfxFIfEbAjOkUODV!{RzLe;vH0ETAFaRW;x-+gtXjLj*8Yk1&w(zP4u8Ey Woj&H!KRf=B@X!5K=P!S@{enMJslW9A literal 0 HcmV?d00001 diff --git a/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.RULE b/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.RULE new file mode 100644 index 000000000..3c0984a8e --- /dev/null +++ b/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.RULE @@ -0,0 +1 @@ +License: GPL-3 diff --git a/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.yml b/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.yml new file mode 100644 index 000000000..8f2188c97 --- /dev/null +++ b/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.yml @@ -0,0 +1,2 @@ +license_expression: gpl-3.0 +is_license_tag: yes diff --git 
a/scantext/tests/data/matched_text/ffmpeg/ffmpeg b/scantext/tests/data/matched_text/ffmpeg/ffmpeg new file mode 100644 index 0000000000000000000000000000000000000000..c06345a809dba9dab8c95e37505411701fdd65c7 GIT binary patch literal 6136 zcmeHL-EQN!71m6D(E1g+7sM9sWU!HJXR<+d23d@g*fRza+pwM44vHe6B+6z-5>-)h z?7^b@4t8%_p;w1rP#`5+7}oFvBbl}!^1=Jd>{G4;A;5v$rGo3o;u$; zRQmTnUK^49b)190|H72%=e2X{{1(4oIKOj#jqsb0Wx9U<#?(3vS^Fja zPoG-B>C@Ndqkey3#puI6wTJYk{&vsn>eIge`Of-Tza58}dTK|${utQ$AA`Wi>-VK= ziq20R`>xwL49|s15|uKsDMYHJO7%znjLs_xRd{#jc;gLG)M z+uHgvO0%-im1FSZ_wA7SWe08j+-i__sY(Sq(5lP>QQ50%CuPwzQ0vO{!%nl~MNyK8 zr868}PX}*qFZ+xBx`PO0;bUh!OXDy_C|>rPv{h+q_DxzM6;D%noZ&vU+I zg;LCyOAPEnm2rsP3&!JE1jcnF_R?2_90-UIXTNp2HGe1S?qL+RpT+%)>^^*O7FVvGRFbt(|^ZC^r;^ z2nyJ+k<~0xv{o=1&IM0tD6!NnCnNi9LIGL$4G9@DB8?* zu~2jFu0>9Z^{iX^Wm=S7uZJqXlyT^4gb(Vjof3!<={Q(%nVw;)LXpC4*Nvqg%G?<3 z;JdRqJ-IzOhBz%Z?lLW*BGcer1>bQQipSJ~;4BEB9&9#e%dft|mq@uE3wN2tyMkFJ z)Nv@asnta;vy3KaC&S*s{>jhakjg~*7c9KTYjv>7m7wXx{<-gvud(7dg?V2l^)@Y5 zI|&9?#Clhbd8jkKSC&54`}NZIZDOlRG7tO(|D4-h$-Rhx7!Q$&I^^5%_wAL ztefrfBFP$t9-kKsNw2Ueo2n_#vSv)R+U(6HwaSFpG+XJtnS=(my~>+$FYeUO2Dnza zFPo!TXHCm@e4X*&L!-Al;Wx*(IXd~ep|L^QZwNlF{k=J&!1BFaK;`9LdDH7HbKYD? 
zVbFW=&2hu16v0XF^z=nT_%ZqGu8bukyUpOi*QOQfJXqHdIOH-$CMXokc#e4;4rhbu zWVT=;UCUghl&RR7=aP0SAq#RAJ-`j7B9@3mvRHP?4K;F$FgZYDRkQF7m|?r@Z@d5moqtEHVCnsJX!mDUmwWJah>Skp4` zfsC*QD9)7oZ4~pRW`AG@-OX_q5Br{VH>dRV+yV8=wz&!WLGYZlvF5NJ;EtU+7Sd9{ zqRq99*Ag)IwRh}w*};uLU0ZTbEC%=f);saK$6oK~IXmk1PP!-EZ%o6swmBXcLi6_$ zyY@4rTN1Db_>V)!XGs4H=|K6F2JA`(01#lKW*JE|9n$=5 z%cNxLeO~p>h6+|qGJVAaun`?p2O@S&^I9!C z%-v{0Cc!TN2PF5$4f;%h@Ngj~ZRj&%Ym z{(ont#bX7)H*BP>UojaGt{*ZYV{k~BXbtygsk1D%?Ch5*(H&@O96UfC`ipLz|;n) zz}?LQvFp-31K@^O;$_VOFX_DbK+_vkd~lFHQt1)HtofzoNyH9`XF_UN=1xj%6VjS% z?kc>IsTid=%41^@?^G!WmEm1v&-1RR^-^|&#e?6ePy!siR-(F0v zr<3tuyqL4|(PDl(9n5a0qdyL=T4w%j1lmFU$B^Yubk)8c)`sYf%wNZAl!g+=*%BGK zI^-Z~4?4db%r9o6>0&e)*Ug;PyujpR+;y}2v(e968^`CPMtgFjzD3p5xy){b>LRsT zr=+KwE3jU1f(AkL^n}$aItT?$oc0dTW?*|2NcW@kELH?-_EvQT(5Y6hSqMeF1+dsv zxm=ml_RL<6JgZ=&K1tQ*lWOkjo7}wi{^9**b#$f-T{WrvstjIB>~-W>o-fPfUv}{S W`N6#H{q~dk!ra!GJN@d@y#5O*7h!Dx literal 0 HcmV?d00001 diff --git a/scantext/tests/data/matched_text/ffmpeg/ffmpeg.exe b/scantext/tests/data/matched_text/ffmpeg/ffmpeg.exe new file mode 100644 index 0000000000000000000000000000000000000000..5a9b37bdb1d6e965201cbd5d028ae82b9333203f GIT binary patch literal 16136 zcmeI3OK%)kcE>Me1BrX$L4bG>WRT32koJ%cyGcp5<4_jIG%3mi^dLyemZK4*>Z;q_ zWz|dd=xz$eAd9R51n?psAxN^w#=syLWHn!4fP8~EK$da7K#(ZEb8l5YD2-^(x|{5- zs{4Na&;Oiz>h@oLrCw7?z0UXfbESUCRX!it-&g!)8|XlnaL zZM^n_l?y*o|M!n8>e7!csK5C;_03zv{yty*Pb;={a!(FvHv#s_MX7(I90LFLFaG$# zYw9DSw96-%B@dta;vODywW2q`1{w@PF1Fzz?8-> zO`wY~c~a8R|bL44-9h!^6+s5jSLie<}XQ8idd6lYf>?@U2y#z3XH>3DIT zEVgVuYW!l<@XGt|E)*~6zsSsDooQArj!9m|*+RA=@v>}jkcxCNH}3SN$c&krT|&yC zN%O^lE63>z-5Z5LQF@DwAEnaSac^OTP~<4hdm!0#(MeFYSnJpuWuE_JA46>G0J5asQulTBdK&ULMK@++q{ARG%HKA~Q0P zI!H!dL^E@frFp3(M%$Q{x|cjj(s81nm{a|ecJULz{ibl)-KxF$ZqVMmV!xf{My~2^ zRqC{_V-u(OsqR%_6okoOp^}6(+fw8C_RCUh(p+j6mt13kASOE({1ZT;PFt7#v8Y6a zTT-k2X&t$ajPtN8DUM;0qCmm4iUKWC%!{Jbr&0aB$$`fk?34f`QhTPp*;ie4tnRCi z)!t%HES3rM>QB4!(bx5-VPMjh-f5n9svzX~xOqOVdSzHfrlq_4b(Vc3s_EE4)^QaP 
zJ(}rW~e$zT9F($_Lh&8AW(L&I{6D$}GGds3br4?}+_osl^jGbk*#c3*e+ zvEx21KTJw6%1Oal@f5*aHfb$PWop$H)pBLIz zU@LJ8rdcnF%=|xS7Y`A7n$Jy>nB0qCO_|pVqp%E3v89zAe;IzQg8Z^pC0xqq-b6?f zRm7}IE>*8D>!Z+$N!7|9iU8A!z3s*R&`SnJt7bP-hFv-&PSV8K%1@fbbhdh1td{Bv zSzF3JD|hcfGi>g9Ik4AkG_stgGwHzwKt$J?^+JLyoR~-kD)vg7h9$9(pzwr-=%$R9 z#Z7aweik!JR};!-Y|=`~7lCHCDNEkn+)1@tfdVDX{8`4sW|xiOK&#LW#z2Do{TNBM zZWm3G%B;yM3W0>&Xm13N3PyJ8bUF?P!%`=P&XQTkFIC5xr9-knwHE|TZkjOHP(YK* z4WOL^wx$4D&a^`owiGc)!e^A(JFP1s%5nRk<&qQf5D5ym`zate%rF9iegWc>kgV?rnbzJ$r zDT;m-(KaheA-yk*0i_0}R}EC(U6eYa#fw$QVo6yZZgt9zi>nEU$XYhJ<=$f<;6nwA z8OzU59v{&cwvs$Iw`$LVGVQ7CloaE__vmLJP%qTE*l?Me8EW9OaHwvAbO#%L?J6G#&AW ztOE<}G85;oT}ic=&q{&WT9?vEG@quPR$H%KiHj3k=U6Ov=hJ%i+FUQs;6_6h!Q9Ng zBK`Mh(3yNo0Uk&4fK{^g1 z{HUuBQ=kR(bL_9MJJv~ZSusd9N28urp(&9@i4aos)JPdwIcn_X35`}^(oYrpwU-tK z3Fjebnu=&=mUgx+jn!}K)y>Jfn|$<_dUNuD+^wmRay(QuJQD=9>kU{<42}316y#?U z*xp19VP3*!khRl3#opwRqj$DbcHMQ@^n$!|O0gS9aOCc>5Lt(OwKmGL#T#~)zS_Og zy&`K@@21&lZpFc>zlJ#Mw2=vH2PaWnxi?ii4yzsnU*DqT2~r;soJok4553ctg)OQ5xUOCAi+X1D zo@?5`YpI|HLw&WWpe}74Sw~@ss&!bH(H{=WGTYkN7>~zYOqAU;A8d%)#*7vl+YklhrMiLOM`F9>ijKtE zYUw@E{Pzr$Q%J;=38M4gAma85!A`TsQj2PT`90BPTd20O?kVv=-4gj{^(nD??1^o` zRlsI>p@KTnnpDn4cEx5w3#`S2A!1L99b%^|6{!FEgKt;V?<)0G|0DI^a?j^~>M&h= zr2d!t-+AlXl{eq|*4n*L{%Z1PwZFu8xz%cfT!|7VYKA58Hq>qK$&GX<$EL=7yFHyOZVUyP7r@7z9YR!X5i#@0oQYvBm27IUcXdLrlv6H1 zvM$kPo3Cy@yeWGwtflUN6k@h>l}M7>d?5xUSOFRV%Db5LkqfKsU=85*!`!bT;HoYO zOBa%{Ipb?7)aMOBzZXBA=c~g!MO<2~wZMvWy)7y}hZm+SxStjSir0zc7>F2mn&4L|4Lujp)ZMhHDIUR-Q%DmvJ+C~re#7P8Eq5Yluo6a%ic_DC{k z>~&d5*s^&UlUIb)2ahc!M^frwkYK*1Kn57GpCc&~MmUT;4jAfE>@h)!ZKO6Cf0sO% zyQeu$#2^a~q82x#BEw~rrEAEGb`;TaE4dzFtuBq6yJEVljj(D8k4s509ST<5QjuFK zat()ixgrN%%pudclHwvJ($4moeW>4lTe}EHhi5x<76THMsh>u#jO#_fsomMLlhvR? 
z*8m!fwy_)2-QQCC7EP^%2}GJk)MQlcl%;1ap;_pc6;gpjQ_#~xl(Votffb-x*I|uw z;5B{B&C|+Y4|Wjx;dQ+!Qeh9lk^!DTdtPO7Fj~XaYF+Ym0Bfx^wTjiDA*$oVf-hyv z`UaZIfv9xORpVrVJsO#|bXM5Cll{&IqRTOraCjiRKHuKiPCGd=+}(Br7ogsmy^ymr*!tvq_gene1s_KY>WGg{_;x>WL}Yo49J+osLGvEJJ%ID zwrh`mHYe>#V+^nLPBHAWw2BfLj1}=gJ9sFAIYp`^XitY?3EE3~|DEZ*#o==?dtEm> zRpG-@rWy1%hdpsnR;F?NAi)k)Y@}W-#vB!zU`^=a0FzsRfu+gv@`TQsk8^^!C9Df^ z%ME@Wpbr#O4%XY^xd1!f;M4(UlKWzzS0w9YK5c9l_GhDA@NaL44<>C6zoGA3*OY#6 zjPQRUW*@ALePSPVo+st4~N&#Y8CUy?{A)NKSpWr}& zwnw+ZKy}kIph@VdsKD;LSN-bw^XLD*11{YZpc21fcG2woV`FxUA2DFl+0G&_!MuS- zahNnu=U5hOzFt-|nYeXj`9p|Zg)>)OTJtPznbMUdbDIvulDRFJ+jnGcvyFQ$oTYxG z(xT5dS0VQy_aXPCT=8Q9k5t8Xrc{cnnS(BI60%AxM97&*YrjCJrbDDePpg$+OHyX( zfd%meJ1@f&l&p8uMq}6%g?iufODxRWD9*2bsbNz&bL^F4vG+;(I865KsqZwzrIf@!G~$F_8@$yw?(5r8~CB7L!YS^ z!v9Qvp*YI`9IQH`&s|;Z_`P6~%+CYu$%^@%Wcl`x*zlDZg1#z3EUasos+6eMhS2eBaKc zc{Pu6si!pR;wATnm~pR^+5^!`?9JIn-gT81kDIT=$UX1$E_}tQN5Yz`eZ{!(ffE)e zZyt#URUfU^9^_k(EZha|kHS6YGNf05&F&@#>G)sau-^EvO&!g^DLGZ)xOPBUVj2^j zmdB9ypbGL}2o&hy6hPCK-{boYDz03ATtELu{ro39|A4Pt`K+w0*rvOu@BMp#x#I`i=MbM%+v-VAxqgeDHceH(gXTW+3-CR7gPjNtluDMh2#vUq^&bg$` ztbv`;$O;pD)ZP)FGAYk$`g8AT;AWktkXCs^0N@rM!=ad36CA%wy!PbXIBbqBjb(sS z8fzn%wlPiCt-~TGJ>-%>minI1K=;01aaxon?vnY0u6cEEhgXS8IbxDG%EjBw<;C+a z(6>!2u^+roo>g&qqa+v5rPei4@3>*JXZ4)Q%j-L-e?!xSC zbl0lTn>IqNj|iP%gwDVG7irzTK&{F5f0K6>wJtvVR_-E%KDQ8BLPvKHnt)tB??G8Y zx8PlAW$5c1xx9d!FTPDog)VtN&Cw<2*&len&5;kcqC>5vi*ja;C($?)GWM2p7bQt* z_#|?cMFZOF(P2V1*ZTdzaAW;mca=#_RV2mO%`A5`HIhfQy2{XwbKH8N74A5kP$7AY zsRv`L6Sol|#_~iI+YxK5ZzwyaldIK_is69Fgo^dd5$CBFjHEHidxJhE#6#h1N`=}^ zUCzM46&*`Xs7q_(q_Se+=CN1@z?hG-Ne#7`@*?GgnJng>GY_H5yy`R>4kPJF(_!_P z*K{;zI_AN0G>>^MSOb!1o5HcwwdaYDp4))fXy=U$>#h&FPINQJs4TfJOQyMSC(2I) z*Z34}EU*RBDi3UkCh;+$w5EM!Bv^ii;5Z(-bvY?x(E-m^cc>IW}=^RZNm)!EZ24{%Jmjq zRl3Zzycx~Qj%|_^{o&vz&w5R?xC35zO9@LPow)54GTPmF{pje`)4i96qodvJ?g}Kp zL=w26L`+BpBa0y@$-;DUFsfZeu>c>8P`TZ!0mTxUPdUDY_n$o9$2qsIoX`Gh4{M7h zg|*!}@@M?K8| literal 0 HcmV?d00001 diff --git 
a/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.RULE b/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.RULE new file mode 100644 index 000000000..c0e32dd8e --- /dev/null +++ b/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.RULE @@ -0,0 +1 @@ +GPLv2 \ No newline at end of file diff --git a/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.yml b/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.yml new file mode 100644 index 000000000..d78d0c44d --- /dev/null +++ b/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.yml @@ -0,0 +1,3 @@ +license_expression: gpl-2.0 +is_license_reference: yes +relevance: 80 \ No newline at end of file diff --git a/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.RULE b/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.RULE new file mode 100644 index 000000000..995ec316a --- /dev/null +++ b/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.RULE @@ -0,0 +1,3 @@ +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
\ No newline at end of file diff --git a/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.yml b/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.yml new file mode 100644 index 000000000..41746474c --- /dev/null +++ b/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.yml @@ -0,0 +1,6 @@ +license_expression: gpl-2.0 OR apache-2.0 +is_license_notice: yes +referenced_filenames: + - COPYING + - LICENSE.Apache +notes: seen in RocksDB diff --git a/scantext/tests/data/matched_text/index/rules/mit_101.RULE b/scantext/tests/data/matched_text/index/rules/mit_101.RULE new file mode 100644 index 000000000..722e438fd --- /dev/null +++ b/scantext/tests/data/matched_text/index/rules/mit_101.RULE @@ -0,0 +1,2 @@ +This source code is licensed under the MIT license found in the +LICENSE file in the root directory of this source tree. diff --git a/scantext/tests/data/matched_text/index/rules/mit_101.yml b/scantext/tests/data/matched_text/index/rules/mit_101.yml new file mode 100644 index 000000000..ca1a71366 --- /dev/null +++ b/scantext/tests/data/matched_text/index/rules/mit_101.yml @@ -0,0 +1,5 @@ +license_expression: mit +is_license_notice: yes +relevance: 100 +referenced_filenames: + - LICENSE \ No newline at end of file diff --git a/scantext/tests/data/matched_text/query.txt b/scantext/tests/data/matched_text/query.txt new file mode 100644 index 000000000..d5dc1521d --- /dev/null +++ b/scantext/tests/data/matched_text/query.txt @@ -0,0 +1,4 @@ +# This source code is licensed under both the Apache 2.0 license (found in the +# LICENSE file in the root directory of this source tree) and the GPLv2 (found +# in the COPYING file in the root directory of this source tree). 
+# You may select, at your option, one of the above-listed licenses diff --git a/scantext/tests/data/matched_text/spdx/query.txt b/scantext/tests/data/matched_text/spdx/query.txt new file mode 100644 index 000000000..0ef045154 --- /dev/null +++ b/scantext/tests/data/matched_text/spdx/query.txt @@ -0,0 +1,12 @@ +@REM ## @file +@REM # Makefile +@REM # +@REM # Copyright (c) 2007 - 2018, Intel Corporation. All rights reserved.
+@REM # SPDX-License-Identifier: BSD-2-Clause-Patent +@REM # + +@echo off +setlocal +set TOOL_ERROR=0 +SET NMAKE_COMMAND=%1 +SHIFT diff --git a/scantext/tests/data/matched_text/tokenize_matched_text_query.txt b/scantext/tests/data/matched_text/tokenize_matched_text_query.txt new file mode 100644 index 000000000..f4d5c8efa --- /dev/null +++ b/scantext/tests/data/matched_text/tokenize_matched_text_query.txt @@ -0,0 +1 @@ +the MODULE_LICENSE_GPL+ foobar \ No newline at end of file diff --git a/scantext/tests/data/matched_text/turkish_unicode/query b/scantext/tests/data/matched_text/turkish_unicode/query new file mode 100644 index 000000000..19adb4ef5 --- /dev/null +++ b/scantext/tests/data/matched_text/turkish_unicode/query @@ -0,0 +1,20 @@ +# Licensed under the Apache License, Version 2.0 +next_label=İrəli + +Some stuff here +İ license MIT + +next_label=İrəli + + +İ license MIT + +Some stuff here +Some more stuff here + +# Licensed under the Apache License, Version 2.0 +next_label=İrəli + +lİcense MİT + +some more diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.RULE b/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.RULE new file mode 100644 index 000000000..f0ec0e607 --- /dev/null +++ b/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.RULE @@ -0,0 +1 @@ +İ license MIT \ No newline at end of file diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.yml b/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.yml new file mode 100644 index 000000000..864a8c3ca --- /dev/null +++ b/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.yml @@ -0,0 +1 @@ +license_expression: mit diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.RULE b/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.RULE new file mode 100644 index 000000000..7ca4781d2 --- /dev/null +++ b/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.RULE @@ -0,0 +1,2 @@ +# Licensed under 
the Apache License, Version 2.0 +next_label=İrəli diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.yml b/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.yml new file mode 100644 index 000000000..a4f80f07b --- /dev/null +++ b/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.yml @@ -0,0 +1 @@ +license_expression: apache-2.0 diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.RULE b/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.RULE new file mode 100644 index 000000000..7b767dbba --- /dev/null +++ b/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.RULE @@ -0,0 +1 @@ +Licensed under the Apache License, Version 2.0 diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.yml b/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.yml new file mode 100644 index 000000000..1443a0848 --- /dev/null +++ b/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.yml @@ -0,0 +1 @@ +license_expression: proprietary-license diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.RULE b/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.RULE new file mode 100644 index 000000000..d00dc0e28 --- /dev/null +++ b/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.RULE @@ -0,0 +1 @@ +lİcense MİT \ No newline at end of file diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.yml b/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.yml new file mode 100644 index 000000000..864a8c3ca --- /dev/null +++ b/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.yml @@ -0,0 +1 @@ +license_expression: mit diff --git a/scantext/tests/data/matched_text/unicode_text/main3.js b/scantext/tests/data/matched_text/unicode_text/main3.js new file mode 100644 index 000000000..f0ec0e607 --- /dev/null +++ b/scantext/tests/data/matched_text/unicode_text/main3.js @@ -0,0 +1 @@ +İ license MIT \ No newline at end 
of file diff --git a/scantext/tests/test.py b/scantext/tests/test.py deleted file mode 100644 index 33eeb08c8..000000000 --- a/scantext/tests/test.py +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/nexB/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/nexB/scancode.io for support and download. - -from django.test import TestCase - -# Create your tests here. diff --git a/scantext/tests/test_match_text.py b/scantext/tests/test_match_text.py new file mode 100644 index 000000000..975a03c63 --- /dev/null +++ b/scantext/tests/test_match_text.py @@ -0,0 +1,1495 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# ScanCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/scancode-toolkit for support or download. 
+# See https://aboutcode.org for more information about nexB OSS projects. +# +import os + +from commoncode.testcase import FileBasedTesting +from licensedcode import cache +from licensedcode import index +from licensedcode import models +from licensedcode.index import LicenseIndex +from licensedcode.match import LicenseMatch +from scantext.match_text import Token +from scantext.match_text import get_full_matched_text +from scantext.match_text import reportable_tokens +from licensedcode.match import tokenize_matched_text +from licensedcode.models import Rule +from licensedcode.models import load_rules +from licensedcode.query import Query +from licensedcode.spans import Span + + +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") + + +class TestCollectLicenseMatchTexts(FileBasedTesting): + test_data_dir = TEST_DATA_DIR + + def test_get_full_matched_text_base(self): + rule_text = """ + Copyright [[some copyright]] + THIS IS FROM [[THE CODEHAUS]] AND CONTRIBUTORS + IN NO EVENT SHALL [[THE CODEHAUS]] OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE [[POSSIBILITY OF SUCH]] DAMAGE + """ + + rule = Rule(stored_text=rule_text, license_expression="test") + idx = index.LicenseIndex([rule]) + + querys = """ + foobar 45 . Copyright 2003 (C) James. All Rights Reserved. + THIS IS FROM THE CODEHAUS AND CONTRIBUTORS + IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC dasdasda . + """ + result = idx.match(query_string=querys) + assert len(result) == 1 + match = result[0] + + # Note that there is a trailing space in that string + expected = """Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved]. + THIS IS FROM THE CODEHAUS AND CONTRIBUTORS + IN NO EVENT SHALL THE [best] CODEHAUS OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
""" + matched_text = "".join( + get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False) + ) + assert matched_text == expected + + expected_nh = """Copyright 2003 (C) James. All Rights Reserved. + THIS IS FROM THE CODEHAUS AND CONTRIBUTORS + IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ + matched_text_nh = "".join( + get_full_matched_text( + match, query_string=querys, idx=idx, _usecache=False, highlight=False + ) + ) + assert matched_text_nh == expected_nh + + expected_origin_text = """Copyright 2003 (C) James. All Rights Reserved. + THIS IS FROM THE CODEHAUS AND CONTRIBUTORS + IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ + origin_matched_text = "".join( + get_full_matched_text( + match, + query_string=querys, + idx=idx, + highlight_not_matched="{}", + ) + ) + assert origin_matched_text == expected_origin_text + + def test_get_full_matched_text(self): + rule_text = """ + Copyright [[some copyright]] + THIS IS FROM [[THE CODEHAUS]] AND CONTRIBUTORS + IN NO EVENT SHALL [[THE CODEHAUS]] OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE [[POSSIBILITY OF SUCH]] DAMAGE + """ + + rule = Rule(stored_text=rule_text, license_expression="test") + idx = index.LicenseIndex([rule]) + + querys = """ + foobar 45 Copyright 2003 (C) James. All Rights Reserved. + THIS IS FROM THE CODEHAUS AND CONTRIBUTORS + IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC + """ + result = idx.match(query_string=querys) + assert len(result) == 1 + match = result[0] + + # Note that there is a trailing space in that string + expected = """Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved]. 
+ THIS IS FROM THE CODEHAUS AND CONTRIBUTORS + IN NO EVENT SHALL THE [best] CODEHAUS OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ + + matched_text = "".join( + get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False) + ) + assert matched_text == expected + + # the text is finally rstripped + matched_text = match.matched_text(_usecache=False) + assert matched_text == expected.rstrip() + + # test again using some HTML with tags + # Note that there is a trailing space in that string + expected = """Copyright
2003
(
C
)
James
.
All

Rights

Reserved
. + THIS IS FROM THE CODEHAUS AND CONTRIBUTORS + IN NO EVENT SHALL THE
best
CODEHAUS OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ + matched_text = "".join( + get_full_matched_text( + match, + query_string=querys, + idx=idx, + highlight_not_matched="
{}
", + _usecache=False, + ) + ) + assert matched_text == expected + + # test again using whole_lines + expected = """ foobar 45 Copyright 2003 (C) James. All Rights Reserved. + THIS IS FROM THE CODEHAUS AND CONTRIBUTORS + IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC\n""" + matched_text = "".join( + get_full_matched_text( + match, + query_string=querys, + idx=idx, + highlight_not_matched="{}", + whole_lines=True, + ) + ) + assert matched_text == expected + + def test_get_full_matched_text_does_not_munge_underscore(self): + rule_text = "MODULE_LICENSE_GPL" + + rule = Rule(stored_text=rule_text, license_expression="test") + idx = index.LicenseIndex([rule]) + + querys = "MODULE_LICENSE_GPL" + result = idx.match(query_string=querys) + assert len(result) == 1 + match = result[0] + + expected = "MODULE_LICENSE_GPL" + matched_text = "".join( + get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False) + ) + assert matched_text == expected + + def test_get_full_matched_text_does_not_munge_plus(self): + rule_text = "MODULE_LICENSE_GPL+ +" + + rule = Rule(stored_text=rule_text, license_expression="test") + idx = index.LicenseIndex([rule]) + + querys = "MODULE_LICENSE_GPL+ +" + result = idx.match(query_string=querys) + assert len(result) == 1 + match = result[0] + + expected = "MODULE_LICENSE_GPL+ +\n" + matched_text = "".join( + get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False) + ) + assert matched_text == expected + + def test_tokenize_matched_text_does_cache_last_call_from_query_string_and_location( + self, + ): + dictionary = {"module": 0, "license": 1, "gpl+": 2} + location = None + query_string = "the MODULE_LICENSE_GPL+ foobar" + result1 = tokenize_matched_text(location, query_string, dictionary) + result2 = tokenize_matched_text(location, query_string, dictionary) + assert result2 is result1 + + location = 
self.get_test_loc("matched_text/tokenize_matched_text_query.txt") + query_string = None + result3 = tokenize_matched_text(location, query_string, dictionary) + assert result3 is not result2 + assert result3 == result2 + + result4 = tokenize_matched_text(location, query_string, dictionary) + assert result4 is result3 + + def test_tokenize_matched_text_does_return_correct_tokens(self): + querys = """ + foobar 45 Copyright 2003 (C) James. All Rights Reserved. THIS + IS FROM THE CODEHAUS AND CONTRIBUTORS + """ + dictionary = dict( + this=0, event=1, possibility=2, reserved=3, liable=5, copyright=6 + ) + result = tokenize_matched_text( + location=None, query_string=querys, dictionary=dictionary + ) + expected = [ + Token( + value="\n", + line_num=1, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="foobar", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="45", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Copyright", + line_num=2, + pos=0, + is_text=True, + is_matched=False, + is_known=True, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="2003", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" (", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="C", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=") ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + 
), + Token( + value="James", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=". ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="All", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Rights", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Reserved", + line_num=2, + pos=1, + is_text=True, + is_matched=False, + is_known=True, + ), + Token( + value=". ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="THIS", + line_num=2, + pos=2, + is_text=True, + is_matched=False, + is_known=True, + ), + Token( + value="\n", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=3, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="IS", + line_num=3, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=3, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="FROM", + line_num=3, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=3, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="THE", + line_num=3, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=3, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="CODEHAUS", + line_num=3, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=3, + pos=-1, + is_text=False, + 
is_matched=False, + is_known=False, + ), + Token( + value="AND", + line_num=3, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=3, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="CONTRIBUTORS", + line_num=3, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value="\n", + line_num=3, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value=" \n", + line_num=4, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + ] + + assert result == expected + + def test_tokenize_matched_text_does_not_crash_on_turkish_unicode(self): + querys = "İrəli" + result = tokenize_matched_text( + location=None, query_string=querys, dictionary={} + ) + + expected = [ + Token( + value="i", + line_num=1, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value="rəli", + line_num=1, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value="\n", + line_num=1, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + ] + assert result == expected + + def test_tokenize_matched_text_behaves_like_query_tokenizer_on_turkish_unicode( + self, + ): + from licensedcode.tokenize import query_tokenizer + + querys = "İrəli" + matched_text_result = tokenize_matched_text( + location=None, query_string=querys, dictionary={} + ) + matched_text_result = [t.value for t in matched_text_result] + query_tokenizer_result = list(query_tokenizer(querys)) + + if matched_text_result[-1] == "\n": + matched_text_result = matched_text_result[:-1] + + assert matched_text_result == query_tokenizer_result + + def test_reportable_tokens_filter_tokens_does_not_strip_last_token_value(self): + tokens = [ + Token( + value="\n", + line_num=1, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, 
+ is_known=False, + ), + Token( + value="foobar", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="45", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Copyright", + line_num=2, + pos=0, + is_text=True, + is_matched=False, + is_known=True, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="2003", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" (", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="C", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=") ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="James", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=". ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="All", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Rights", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Reserved", + line_num=2, + pos=1, + is_text=True, + is_matched=False, + is_known=True, + ), + Token( + value=". 
", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="THIS", + line_num=2, + pos=2, + is_text=True, + is_matched=False, + is_known=True, + ), + Token( + value="\n", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=3, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + ] + + match_qspan = Span(0, 1) + result = list( + reportable_tokens( + tokens, match_qspan, start_line=1, end_line=2, whole_lines=False + ) + ) + expected = [ + Token( + value="Copyright", + line_num=2, + pos=0, + is_text=True, + is_matched=True, + is_known=True, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="2003", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" (", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="C", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=") ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="James", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=". ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="All", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Rights", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Reserved", + line_num=2, + pos=1, + is_text=True, + is_matched=True, + is_known=True, + ), + Token( + value=". 
", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + ] + + assert result == expected + + # test again with whole lines + match_qspan = Span(0, 1) + result = list( + reportable_tokens( + tokens, match_qspan, start_line=1, end_line=2, whole_lines=True + ) + ) + expected = [ + Token( + value="\n", + line_num=1, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="foobar", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="45", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Copyright", + line_num=2, + pos=0, + is_text=True, + is_matched=True, + is_known=True, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="2003", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" (", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="C", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=") ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="James", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=". 
", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="All", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Rights", + line_num=2, + pos=-1, + is_text=True, + is_matched=False, + is_known=False, + ), + Token( + value=" ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="Reserved", + line_num=2, + pos=1, + is_text=True, + is_matched=True, + is_known=True, + ), + Token( + value=". ", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + Token( + value="THIS", + line_num=2, + pos=2, + is_text=True, + is_matched=False, + is_known=True, + ), + Token( + value="\n", + line_num=2, + pos=-1, + is_text=False, + is_matched=False, + is_known=False, + ), + ] + + assert result == expected + + def test_matched_text_is_collected_correctly_end2end(self): + rules_data_dir = self.get_test_loc("matched_text/index/rules") + query_location = self.get_test_loc("matched_text/query.txt") + rules = models.load_rules(rules_data_dir) + idx = LicenseIndex(rules) + + results = [ + match.matched_text(_usecache=False) + for match in idx.match(location=query_location) + ] + expected = [ + "This source code is licensed under both the Apache 2.0 license " + "(found in the\n# LICENSE", + "This source code is licensed under [both] [the] [Apache] [2].[0] license " + "(found in the\n# LICENSE file in the root directory of this source tree)", + "GPLv2 (", + ] + assert results == expected + + def check_matched_texts(self, test_loc, expected_texts, whole_lines=True): + idx = cache.get_index() + test_loc = self.get_test_loc(test_loc) + matches = idx.match(location=test_loc) + matched_texts = [ + m.matched_text(whole_lines=whole_lines, highlight=False, _usecache=False) + for m in matches + ] + assert matched_texts == 
expected_texts + + def test_matched_text_is_collected_correctly_end2end_for_spdx_match_whole_lines( + self, + ): + self.check_matched_texts( + test_loc="matched_text/spdx/query.txt", + expected_texts=["@REM # SPDX-License-Identifier: BSD-2-Clause-Patent"], + whole_lines=True, + ) + + def test_matched_text_is_collected_correctly_end2end_for_spdx_match_plain(self): + self.check_matched_texts( + test_loc="matched_text/spdx/query.txt", + expected_texts=["SPDX-License-Identifier: BSD-2-Clause-Patent"], + whole_lines=False, + ) + + def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_query( + self, + ): + idx = cache.get_index() + querys_with_diacritic_unicode = "İ license MIT" + result = idx.match(query_string=querys_with_diacritic_unicode) + assert len(result) == 1 + match = result[0] + expected = "license MIT" + matched_text = match.matched_text( + _usecache=False, + ) + assert matched_text == expected + + def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_file(self): + idx = cache.get_index() + file_with_diacritic_unicode_location = self.get_test_loc( + "matched_text/unicode_text/main3.js" + ) + result = idx.match(location=file_with_diacritic_unicode_location) + assert len(result) == 1 + match = result[0] + expected = "license MIT" + matched_text = match.matched_text(_usecache=False) + assert matched_text == expected + + def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_query_whole_lines( + self, + ): + idx = cache.get_index() + querys_with_diacritic_unicode = "İ license MIT" + result = idx.match(query_string=querys_with_diacritic_unicode) + assert len(result) == 1 + match = result[0] + expected = "[İ] license MIT" + matched_text = match.matched_text(_usecache=False, whole_lines=True) + assert matched_text == expected + + def test_matched_text_is_not_truncated_with_unicode_diacritic_input_with_diacritic_in_rules( + self, + ): + rule_dir = self.get_test_loc("matched_text/turkish_unicode/rules") + idx = 
index.LicenseIndex(load_rules(rule_dir)) + query_loc = self.get_test_loc("matched_text/turkish_unicode/query") + matches = idx.match(location=query_loc) + matched_texts = [ + m.matched_text(whole_lines=False, highlight=False, _usecache=False) + for m in matches + ] + + expected = [ + "Licensed under the Apache License, Version 2.0\r\nnext_label=irəli", + "İ license MIT", + "İ license MIT", + "Licensed under the Apache License, Version 2.0\r\nnext_label=irəli", + "lİcense mit", + ] + + assert matched_texts == expected + + def test_matched_text_is_not_truncated_with_unicode_diacritic_input_and_full_index( + self, + ): + expected = [ + "Licensed under the Apache License, Version 2.0", + "license MIT", + "license MIT", + "Licensed under the Apache License, Version 2.0", + ] + + self.check_matched_texts( + test_loc="matched_text/turkish_unicode/query", + expected_texts=expected, + whole_lines=False, + ) + + def test_matched_text_does_not_ignores_whole_lines_in_binary_with_small_index(self): + rule_dir = self.get_test_loc("matched_text/binary_text/rules") + idx = index.LicenseIndex(load_rules(rule_dir)) + query_loc = self.get_test_loc("matched_text/binary_text/gosu") + matches = idx.match(location=query_loc) + matched_texts = [ + m.matched_text(whole_lines=True, highlight=False, _usecache=False) + for m in matches + ] + + expected = [ + "{{ .Self }} license: GPL-3 (full text at https://github.com/tianon/gosu)" + ] + + assert matched_texts == expected + + def test_matched_text_does_not_ignores_whole_lines_in_binary_against_full_index( + self, + ): + expected = [ + "{{ .Self }} license: GPL-3 (full text at https://github.com/tianon/gosu)" + ] + self.check_matched_texts( + test_loc="matched_text/binary_text/gosu", + expected_texts=expected, + whole_lines=True, + ) + + def test_matched_text_is_collected_correctly_in_binary_ffmpeg_windows_whole_lines( + self, + ): + expected_texts = [ + "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " + 
"--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " + "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " + "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " + "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " + "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " + "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " + "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " + "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " + "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " + "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " + "--enable-lzma --enable-decklink --enable-zlib", + "%sconfiguration: --enable-gpl --enable-version3 --enable-dxva2 " + "--enable-libmfx --enable-nvenc --enable-avisynth --enable-bzlib " + "--enable-fontconfig --enable-frei0r --enable-gnutls --enable-iconv " + "--enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca " + "--enable-libfreetype --enable-libgme --enable-libgsm --enable-libilbc " + "--enable-libmodplug --enable-libmp3lame --enable-libopencore-amrnb " + "--enable-libopencore-amrwb --enable-libopenh264 --enable-libopenjpeg " + "--enable-libopus --enable-librtmp --enable-libsnappy --enable-libsoxr " + "--enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab " + "--enable-libvo-amrwbenc --enable-libvorbis --enable-libvpx " + "--enable-libwavpack --enable-libwebp --enable-libx264 --enable-libx265 " + "--enable-libxavs --enable-libxvid --enable-libzimg --enable-lzma " + "--enable-decklink --enable-zlib", + "%s is free software; you can redistribute it and/or modify\n" + "it under the terms of the GNU General Public License as published by\n" + "the Free Software Foundation; either version 3 of the License, or\n" + "(at your option) any later version.\n" + "%s is distributed in the hope 
that it will be useful,\n" + "but WITHOUT ANY WARRANTY; without even the implied warranty of\n" + "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n" + "GNU General Public License for more details.\n" + "You should have received a copy of the GNU General Public License\n" + "along with %s. If not, see .\n" + "File formats:\n" + "D. = Demuxing supported\n" + ".E = Muxing supported\n" + "%s%s %-15s %s\n" + "Devices:\n" + "Codecs:\n" + "D..... = Decoding supported\n" + ".E.... = Encoding supported\n" + "..V... = Video codec\n" + "No option name near '%s'\n" + "Unable to parse '%s': %s\n" + "Setting '%s' to value '%s'\n" + "Option '%s' not found\n" + "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " + "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " + "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " + "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " + "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " + "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " + "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " + "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " + "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " + "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " + "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " + "--enable-lzma --enable-decklink --enable-zlib", + "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " + "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " + "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " + "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " + "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " + "--enable-libopencore-amrnb --enable-libopencore-amrwb 
--enable-libopenh264 " + "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " + "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " + "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " + "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " + "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " + "--enable-lzma --enable-decklink --enable-zlib", + "libavfilter license: GPL version 3 or later", + "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " + "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " + "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " + "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " + "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " + "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " + "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " + "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " + "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " + "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " + "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " + "--enable-lzma --enable-decklink --enable-zlib", + "libavformat license: GPL version 3 or later", + "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " + "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " + "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " + "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " + "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " + "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " + "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " + "--enable-libsoxr 
--enable-libspeex --enable-libtheora --enable-libtwolame " + "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " + "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " + "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " + "--enable-lzma --enable-decklink --enable-zlib", + "libavcodec license: GPL version 3 or later", + "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " + "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " + "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " + "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " + "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " + "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " + "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " + "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " + "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " + "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " + "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " + "--enable-lzma --enable-decklink --enable-zlib", + "libpostproc license: GPL version 3 or later", + "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " + "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " + "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " + "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " + "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " + "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " + "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " + "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " + "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " + 
"--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " + "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " + "--enable-lzma --enable-decklink --enable-zlib", + "libswresample license: GPL version 3 or later", + "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " + "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " + "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " + "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " + "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " + "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " + "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " + "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " + "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " + "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " + "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " + "--enable-lzma --enable-decklink --enable-zlib", + "libswscale license: GPL version 3 or later", + "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " + "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " + "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " + "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " + "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " + "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " + "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " + "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " + "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " + "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " + "--enable-libx265 --enable-libxavs --enable-libxvid 
--enable-libzimg " + "--enable-lzma --enable-decklink --enable-zlib", + "libavutil license: GPL version 3 or later", + "This software is derived from the GNU GPL XviD codec (1.3.0).", + ] + + self.check_matched_texts( + test_loc="matched_text/ffmpeg/ffmpeg.exe", + expected_texts=expected_texts, + whole_lines=True, + ) + + def test_matched_text_is_collected_correctly_in_binary_ffmpeg_windows_not_whole_lines( + self, + ): + expected_texts = [ + "enable-gpl --enable-version3 --", + "enable-gpl --enable-version3 --", + "is free software; you can redistribute it and/or modify\n" + "it under the terms of the GNU General Public License as published by\n" + "the Free Software Foundation; either version 3 of the License, or\n" + "(at your option) any later version.\n" + "%s is distributed in the hope that it will be useful,\n" + "but WITHOUT ANY WARRANTY; without even the implied warranty of\n" + "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n" + "GNU General Public License for more details.\n" + "You should have received a copy of the GNU General Public License\n" + "along with %s. If not, see .\n" + "File formats:\n" + "D. = Demuxing supported\n" + ".E = Muxing supported\n" + "%s%s %-15s %s\n" + "Devices:\n" + "Codecs:\n" + "D..... = Decoding supported\n" + ".E.... = Encoding supported\n" + "..V... 
= Video codec\n" + "No option name near '%s'\n" + "Unable to parse '%s': %s\n" + "Setting '%s' to value '%s'\n" + "Option '%s' not found\n" + "--enable-gpl --", + "enable-gpl --enable-version3 --", + "license: GPL version 3 or later", + "enable-gpl --enable-version3 --", + "license: GPL version 3 or later", + "enable-gpl --enable-version3 --", + "license: GPL version 3 or later", + "enable-gpl --enable-version3 --", + "license: GPL version 3 or later", + "enable-gpl --enable-version3 --", + "license: GPL version 3 or later", + "enable-gpl --enable-version3 --", + "license: GPL version 3 or later", + "enable-gpl --enable-version3 --", + "license: GPL version 3 or later", + "This software is derived from the GNU GPL XviD codec (", + ] + + self.check_matched_texts( + test_loc="matched_text/ffmpeg/ffmpeg.exe", + expected_texts=expected_texts, + whole_lines=False, + ) + + def test_matched_text_is_collected_correctly_in_binary_ffmpeg_elf_whole_lines(self): + expected_texts = [ + "--prefix=/usr --extra-version=0ubuntu0.1 --build-suffix=-ffmpeg " + "--toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu " + "--incdir=/usr/include/x86_64-linux-gnu --cc=cc --cxx=g++ --enable-gpl " + "--enable-shared --disable-stripping --disable-decoder=libopenjpeg " + "--disable-decoder=libschroedinger --enable-avresample --enable-avisynth " + "--enable-gnutls --enable-ladspa --enable-libass --enable-libbluray " + "--enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite " + "--enable-libfontconfig --enable-libfreetype --enable-libfribidi " + "--enable-libgme --enable-libgsm --enable-libmodplug --enable-libmp3lame " + "--enable-libopenjpeg --enable-libopus --enable-libpulse --enable-librtmp " + "--enable-libschroedinger --enable-libshine --enable-libsnappy " + "--enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora " + "--enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack " + "--enable-libwebp --enable-libx265 --enable-libxvid 
--enable-libzvbi " + "--enable-openal --enable-opengl --enable-x11grab --enable-libdc1394 " + "--enable-libiec61883 --enable-libzmq --enable-frei0r --enable-libx264 " + "--enable-libopencv", + "%sconfiguration: --prefix=/usr --extra-version=0ubuntu0.1 " + "--build-suffix=-ffmpeg --toolchain=hardened " + "--libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu " + "--cc=cc --cxx=g++ --enable-gpl --enable-shared --disable-stripping " + "--disable-decoder=libopenjpeg --disable-decoder=libschroedinger " + "--enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa " + "--enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca " + "--enable-libcdio --enable-libflite --enable-libfontconfig " + "--enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm " + "--enable-libmodplug --enable-libmp3lame --enable-libopenjpeg " + "--enable-libopus --enable-libpulse --enable-librtmp --enable-libschroedinger " + "--enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex " + "--enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis " + "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 " + "--enable-libxvid --enable-libzvbi --enable-openal --enable-opengl " + "--enable-x11grab --enable-libdc1394 --enable-libiec61883 --enable-libzmq " + "--enable-frei0r --enable-libx264 --enable-libopencv", + "%s is free software; you can redistribute it and/or modify\n" + "it under the terms of the GNU General Public License as published by\n" + "the Free Software Foundation; either version 2 of the License, or\n" + "(at your option) any later version.\n" + "%s is distributed in the hope that it will be useful,\n" + "but WITHOUT ANY WARRANTY; without even the implied warranty of\n" + "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the\n" + "GNU General Public License for more details.\n" + "You should have received a copy of the GNU General Public License\n" + "along with %s; if not, write to the Free Software\n" + "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA", + ] + + self.check_matched_texts( + test_loc="matched_text/ffmpeg/ffmpeg", + expected_texts=expected_texts, + whole_lines=True, + ) + + def test_matched_text_is_collected_correctly_in_binary_ffmpeg_static_whole_lines( + self, + ): + expected_texts = ["libswresample license: LGPL version 2.1 or later"] + self.check_matched_texts( + test_loc="matched_text/ffmpeg/libavsample.lib", + expected_texts=expected_texts, + whole_lines=True, + ) From 76e8a0383933bf9da464fbc2347fb60df1d3071f Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 9 Aug 2022 12:32:37 +0200 Subject: [PATCH 30/59] Make scancode match text test pass * Some adjustments were needed to ensure we could run these copied tests correctly. * See also this issue that requires to install SCTK locally in SCIO using "pip install --editable --- scantext/match_text.py | 3 --- scantext/tests/test_match_text.py | 25 ++++++++++--------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/scantext/match_text.py b/scantext/match_text.py index e43a60ca2..2c46cacfb 100644 --- a/scantext/match_text.py +++ b/scantext/match_text.py @@ -7,11 +7,8 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -from enum import IntEnum -from itertools import groupby import attr -from attr import validators from licensedcode import query from licensedcode.spans import Span from licensedcode.stopwords import STOPWORDS diff --git a/scantext/tests/test_match_text.py b/scantext/tests/test_match_text.py index 975a03c63..6852bde77 100644 --- a/scantext/tests/test_match_text.py +++ b/scantext/tests/test_match_text.py @@ -13,16 +13,11 @@ from licensedcode import cache from licensedcode import index from licensedcode import models -from licensedcode.index import LicenseIndex -from licensedcode.match import LicenseMatch -from scantext.match_text import Token +from licensedcode.spans import Span from scantext.match_text import get_full_matched_text from scantext.match_text import reportable_tokens -from licensedcode.match import tokenize_matched_text -from licensedcode.models import Rule -from licensedcode.models import load_rules -from licensedcode.query import Query -from licensedcode.spans import Span +from scantext.match_text import Token +from scantext.match_text import tokenize_matched_text TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") @@ -39,7 +34,7 @@ def test_get_full_matched_text_base(self): EVEN IF ADVISED OF THE [[POSSIBILITY OF SUCH]] DAMAGE """ - rule = Rule(stored_text=rule_text, license_expression="test") + rule = models.Rule(stored_text=rule_text, license_expression="test") idx = index.LicenseIndex([rule]) querys = """ @@ -95,7 +90,7 @@ def test_get_full_matched_text(self): EVEN IF ADVISED OF THE [[POSSIBILITY OF SUCH]] DAMAGE """ - rule = Rule(stored_text=rule_text, license_expression="test") + rule = models.Rule(stored_text=rule_text, license_expression="test") idx = index.LicenseIndex([rule]) querys = """ @@ -159,7 +154,7 @@ def test_get_full_matched_text(self): def test_get_full_matched_text_does_not_munge_underscore(self): rule_text = "MODULE_LICENSE_GPL" - rule = Rule(stored_text=rule_text, license_expression="test") + rule = 
models.Rule(stored_text=rule_text, license_expression="test") idx = index.LicenseIndex([rule]) querys = "MODULE_LICENSE_GPL" @@ -176,7 +171,7 @@ def test_get_full_matched_text_does_not_munge_underscore(self): def test_get_full_matched_text_does_not_munge_plus(self): rule_text = "MODULE_LICENSE_GPL+ +" - rule = Rule(stored_text=rule_text, license_expression="test") + rule = models.Rule(stored_text=rule_text, license_expression="test") idx = index.LicenseIndex([rule]) querys = "MODULE_LICENSE_GPL+ +" @@ -1067,7 +1062,7 @@ def test_matched_text_is_collected_correctly_end2end(self): rules_data_dir = self.get_test_loc("matched_text/index/rules") query_location = self.get_test_loc("matched_text/query.txt") rules = models.load_rules(rules_data_dir) - idx = LicenseIndex(rules) + idx = index.LicenseIndex(rules) results = [ match.matched_text(_usecache=False) @@ -1150,7 +1145,7 @@ def test_matched_text_is_not_truncated_with_unicode_diacritic_input_with_diacrit self, ): rule_dir = self.get_test_loc("matched_text/turkish_unicode/rules") - idx = index.LicenseIndex(load_rules(rule_dir)) + idx = index.LicenseIndex(models.load_rules(rule_dir)) query_loc = self.get_test_loc("matched_text/turkish_unicode/query") matches = idx.match(location=query_loc) matched_texts = [ @@ -1186,7 +1181,7 @@ def test_matched_text_is_not_truncated_with_unicode_diacritic_input_and_full_ind def test_matched_text_does_not_ignores_whole_lines_in_binary_with_small_index(self): rule_dir = self.get_test_loc("matched_text/binary_text/rules") - idx = index.LicenseIndex(load_rules(rule_dir)) + idx = index.LicenseIndex(models.load_rules(rule_dir)) query_loc = self.get_test_loc("matched_text/binary_text/gosu") matches = idx.match(location=query_loc) matched_texts = [ From 8727cb7b81c082fc670744535137fb255c81c07b Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 9 Aug 2022 12:57:10 +0200 Subject: [PATCH 31/59] Add list of matches to Token Signed-off-by: Philippe Ombredanne --- scantext/match_text.py | 
24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/scantext/match_text.py b/scantext/match_text.py index 2c46cacfb..f621872b0 100644 --- a/scantext/match_text.py +++ b/scantext/match_text.py @@ -7,7 +7,6 @@ # See https://aboutcode.org for more information about nexB OSS projects. # - import attr from licensedcode import query from licensedcode.spans import Span @@ -65,21 +64,44 @@ class Token(object): """ Used to represent a token in collected query-side matched texts and SPDX identifiers. + + ``matches`` is a list of LicenseMatch to accommodate overlapping matches. + For example, say we have these two matched text portions: + QueryText: this is licensed under GPL or MIT + Match1: this is licensed under GPL + Match2: licensed under GPL or MIT + + Each Token would be assigned one or more LicenseMatch: + this: Match1 : yellow + is: Match1 : yellow + licensed: Match1, Match2 : orange (mixing yellow and pink colors) + under: Match1, Match2 : orange (mixing yellow and pink colors) + GPL: Match1, Match2 : orange (mixing yellow and pink colors) + or: Match2 : pink + MIT: Match2 : pink """ # original text value for this token. value = attr.ib() + # line number, one-based line_num = attr.ib() + # absolute position for known tokens, zero-based.
-1 for unknown tokens pos = attr.ib(default=-1) + # True if text/alpha False if this is punctuation or spaces is_text = attr.ib(default=False) + # True if part of a match is_matched = attr.ib(default=False) + # True if this is a known token is_known = attr.ib(default=False) + # List of LicenseMatch that match this token + matches = attr.ib(attr.Factory(list)) + def tokenize_matched_text( location, From 3f7b64081294174510976f5025b9fbafb472eb1d Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Fri, 12 Aug 2022 15:29:17 +0530 Subject: [PATCH 32/59] Add all detected values into the table #450 Signed-off-by: Akhil Raj --- .../includes/license_summary_cards.html | 59 ++++++++++++++-- .../includes/license_summary_detail.html | 69 +++++++++++++++---- scantext/views.py | 8 ++- 3 files changed, 115 insertions(+), 21 deletions(-) diff --git a/scantext/templates/scantext/includes/license_summary_cards.html b/scantext/templates/scantext/includes/license_summary_cards.html index 88f63587b..f099bbbd5 100644 --- a/scantext/templates/scantext/includes/license_summary_cards.html +++ b/scantext/templates/scantext/includes/license_summary_cards.html @@ -39,29 +39,74 @@
- - + + - - + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + + + + + +
Name{{ license.name }}Key{{ license.key }}
Score {{ license.score }}
Owner{{ license.owner }}Name{{ license.name }}
Short Name{{ license.short_name }}
Lines{% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %}
Category {{ license.category }}
SPDX KeyReference{{ license.reference_url }}
Exection{{ license.is_exception }}
Is Unknown{{ license.is_unknown }}
Owner{{ license.owner }}
Homepage{{ license.homepage_url }}
Text URL{{ license.text_url }}
Scancode Text URL{{ license.scancode_text_url }}
Scancode Data URL{{ license.scancode_data_url }}
SPDX License Key {{ license.spdx_license_key }}
Reference{{ license.reference_url }}SPDX URL{{ license.spdx_url }}
Lines{% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %}
diff --git a/scantext/templates/scantext/includes/license_summary_detail.html b/scantext/templates/scantext/includes/license_summary_detail.html index 25990c3a5..0f1d95d75 100644 --- a/scantext/templates/scantext/includes/license_summary_detail.html +++ b/scantext/templates/scantext/includes/license_summary_detail.html @@ -9,40 +9,85 @@
-
+

Detected Licenses

-
- - -
{% for license in detected_licenses.licenses %} - +

+ {% for rule in license.rules %} + {{ rule }} + {% endfor %} +

+
- - + + + + + + - - + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Name{{ license.name }}Matched Rule{{ license.rule }}
Key{{ license.key }}
Score {{ license.score }}
Owner{{ license.owner }}Name{{ license.name }}
Short Name{{ license.short_name }}
Category {{ license.category }}
SPDX Key{{ license.spdx_license_key }}Lines{% if license.start_line == license.end_line %} {{ license.start_line }} {% else %} {{ license.start_line }} - {{ license.end_line }} {% endif %}
Reference {{ license.reference_url }}
Exection{{ license.is_exception }}
Is Unknown{{ license.is_unknown }}
Owner{{ license.owner }}
Homepage{{ license.homepage_url }}
Text URL{{ license.text_url }}
Scancode Text URL{{ license.scancode_text_url }}
Scancode Data URL{{ license.scancode_data_url }}
SPDX License Key{{ license.spdx_license_key }}
SPDX URL{{ license.spdx_url }}
{% endfor %} diff --git a/scantext/views.py b/scantext/views.py index 5dcb703d2..d4c1dbf55 100644 --- a/scantext/views.py +++ b/scantext/views.py @@ -156,6 +156,7 @@ def get_licenses( detected_licenses = [] detected_expressions = [] + # gets matches from a license file matches = idx.match( location=location, min_score=0, @@ -167,6 +168,7 @@ def get_licenses( qspans = [] match = None complete_text_in_array = [] + # run through a list of matches for match in matches: qspans.append(match.qspan) @@ -244,6 +246,8 @@ def _licenses_data_from_match( result["reference_url"] = license_url_template.format(lic.key) result["scancode_text_url"] = SCANCODE_LICENSE_TEXT_URL.format(lic.key) result["scancode_data_url"] = SCANCODE_LICENSE_DATA_URL.format(lic.key) + result["rule"] = match.rule.license_expression + result["rules"] = match.rule.license_keys() spdx_key = lic.spdx_license_key result["spdx_license_key"] = spdx_key @@ -316,11 +320,11 @@ def get_highlighted_lines( } .not-matched { - color: #ac0000; + background-color: #ff0000; } .matched { - color: #00ac00; + background-color: #00ff00; }
From 658df1f8cba474748e06c8050eda616df01734a9 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Wed, 17 Aug 2022 12:35:17 +0530 Subject: [PATCH 33/59] Add details page to view matched license details #450 Signed-off-by: Akhil Raj --- .../includes/license_summary_cards.html | 49 ++------------ .../includes/license_summary_chart.html | 10 +++ .../includes/license_summary_detail.html | 58 ++++------------- .../license_summary_more_details.html | 65 +++++++++++++++++++ .../templates/scantext/license_summary.html | 31 +++++++++ 5 files changed, 121 insertions(+), 92 deletions(-) create mode 100644 scantext/templates/scantext/includes/license_summary_chart.html create mode 100644 scantext/templates/scantext/includes/license_summary_more_details.html diff --git a/scantext/templates/scantext/includes/license_summary_cards.html b/scantext/templates/scantext/includes/license_summary_cards.html index f099bbbd5..f80c3450e 100644 --- a/scantext/templates/scantext/includes/license_summary_cards.html +++ b/scantext/templates/scantext/includes/license_summary_cards.html @@ -50,63 +50,22 @@ Name {{ license.name }} - - Short Name - {{ license.short_name }} - Lines {% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %} - - Category - {{ license.category }} - - - Reference - {{ license.reference_url }} - - - Exection - {{ license.is_exception }} - - - Is Unknown - {{ license.is_unknown }} - Owner {{ license.owner }} - Homepage - {{ license.homepage_url }} - - - Text URL - {{ license.text_url }} - - - Scancode Text URL - {{ license.scancode_text_url }} - - - Scancode Data URL - {{ license.scancode_data_url }} - - - SPDX License Key - {{ license.spdx_license_key }} - - - SPDX URL - {{ license.spdx_url }} + Category + {{ license.category }} - Lines - {% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line 
}} {% endif %} + Reference + {{ license.reference_url }} -
diff --git a/scantext/templates/scantext/includes/license_summary_chart.html b/scantext/templates/scantext/includes/license_summary_chart.html new file mode 100644 index 000000000..13ddf0a54 --- /dev/null +++ b/scantext/templates/scantext/includes/license_summary_chart.html @@ -0,0 +1,10 @@ +
+
+

Detected License Expressions

+
+
+
+

License Expressions Scores

+
+
+
\ No newline at end of file diff --git a/scantext/templates/scantext/includes/license_summary_detail.html b/scantext/templates/scantext/includes/license_summary_detail.html index 0f1d95d75..d25e50542 100644 --- a/scantext/templates/scantext/includes/license_summary_detail.html +++ b/scantext/templates/scantext/includes/license_summary_detail.html @@ -1,11 +1,15 @@

Input License Text

- {% for text in detected_licenses.complete_text_in_array %} + +
+ {{ detected_licenses.complete_text|safe }} +
+
@@ -20,10 +24,6 @@

- - - - @@ -36,57 +36,21 @@ - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - + + - - + +
Matched Rule{{ license.rule }}
Key {{ license.key }}Name {{ license.name }}
Short Name{{ license.short_name }}
Category{{ license.category }}
Lines{% if license.start_line == license.end_line %} {{ license.start_line }} {% else %} {{ license.start_line }} - {{ license.end_line }} {% endif %}
Reference{{ license.reference_url }}
Exection{{ license.is_exception }}
Is Unknown{{ license.is_unknown }}{% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %}
Owner {{ license.owner }}
Homepage{{ license.homepage_url }}
Text URL{{ license.text_url }}
Scancode Text URL{{ license.scancode_text_url }}
Scancode Data URL{{ license.scancode_data_url }}
SPDX License Key{{ license.spdx_license_key }}Category{{ license.category }}
SPDX URL{{ license.spdx_url }}Reference{{ license.reference_url }}
diff --git a/scantext/templates/scantext/includes/license_summary_more_details.html b/scantext/templates/scantext/includes/license_summary_more_details.html new file mode 100644 index 000000000..1ee034635 --- /dev/null +++ b/scantext/templates/scantext/includes/license_summary_more_details.html @@ -0,0 +1,65 @@ +
+
+

Detected Licenses

+
+ {% for license in detected_licenses.licenses %} +
+
+

{{ license.name }}

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Rule{{ license.rule }}Key{{ license.key }}Score{{ license.score }}
Name{{ license.name }}Short Name{{ license.short_name }}Category{{ license.category }}
Line(s){% if license.start_line == license.end_line %} {{ license.start_line }} {% else %} {{ license.start_line }} - {{ license.end_line }} {% endif %}Reference{{ license.reference_url }}Exception{{ license.is_exception }}
Is Unknown{{ license.is_unknown }}Owner{{ license.owner }}Homepage{{ license.homepage_url }}
Text URL{{ license.text_url }}Scancode Text URL{{ license.scancode_text_url }}Scancode Data URL{{ license.scancode_data_url }}
SPDX License Key{{ license.spdx_license_key }}SPDX URL{{ license.spdx_url }}Matched Rule{{ license.matched_rule }}
+
+ {% endfor %} + +
diff --git a/scantext/templates/scantext/license_summary.html b/scantext/templates/scantext/license_summary.html index f95a7a96f..a6fc98e5d 100644 --- a/scantext/templates/scantext/license_summary.html +++ b/scantext/templates/scantext/license_summary.html @@ -18,6 +18,7 @@

License Detection Summary

@@ -30,12 +31,18 @@

License Detection Summary

{% include 'scantext/includes/license_summary_detail.html' with detected_licenses=detected_licenses %} + {% include 'scantext/includes/license_summary_chart.html' with detected_licenses=detected_licenses %} +
+ +
{% endblock %} {% block scripts %} + +{{ detected_licenses|json_script:"detected_licenses" }} + {% endblock %} From 8edf20776d9f8d66cc69f1aec86e93ae076def67 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Wed, 17 Aug 2022 12:36:30 +0530 Subject: [PATCH 34/59] Add a mini licenses file to test for development #450 Signed-off-by: Akhil Raj --- scantext/tests/data/licenses | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 scantext/tests/data/licenses diff --git a/scantext/tests/data/licenses b/scantext/tests/data/licenses new file mode 100644 index 000000000..5890f4498 --- /dev/null +++ b/scantext/tests/data/licenses @@ -0,0 +1,6 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +[This is the first released version of the Lesser GPL + From 787133a96a722514ba8fb15fa26cfb4f2a0183f6 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Wed, 17 Aug 2022 12:39:16 +0530 Subject: [PATCH 35/59] Move import of Token to top #450 Signed-off-by: Akhil Raj --- scantext/tests/test_match_text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scantext/tests/test_match_text.py b/scantext/tests/test_match_text.py index 6852bde77..2f46273fd 100644 --- a/scantext/tests/test_match_text.py +++ b/scantext/tests/test_match_text.py @@ -14,12 +14,12 @@ from licensedcode import index from licensedcode import models from licensedcode.spans import Span + +from scantext.match_text import Token from scantext.match_text import get_full_matched_text from scantext.match_text import reportable_tokens -from scantext.match_text import Token from scantext.match_text import tokenize_matched_text - TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") From 4a08ea9b8865e4e90e07602090f5b3f0384bf2a9 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Wed, 17 Aug 2022 12:53:01 +0530 Subject: [PATCH 36/59] Add `license_chart_data` to render charts #450 Signed-off-by: Akhil Raj --- scantext/views.py | 4 +++- 1 file changed, 3 insertions(+), 1 
deletion(-) diff --git a/scantext/views.py b/scantext/views.py index d4c1dbf55..3f736b484 100644 --- a/scantext/views.py +++ b/scantext/views.py @@ -155,6 +155,7 @@ def get_licenses( detected_licenses = [] detected_expressions = [] + detected_expressions_with_scores = [] # gets matches from a license file matches = idx.match( @@ -173,7 +174,7 @@ def get_licenses( qspans.append(match.qspan) detected_expressions.append(match.rule.license_expression) - + detected_expressions_with_scores.append([match.rule.license_expression, match.score()]) detected_licenses.extend( _licenses_data_from_match( match=match, @@ -202,6 +203,7 @@ def get_licenses( [ ("licenses", detected_licenses), ("license_expressions", detected_expressions), + ("license_expressions_scores", detected_expressions_with_scores), ("percentage_of_license_text", percentage_of_license_text), ("complete_text_in_array", complete_text_in_array), ] From 32c3931ad1242fdbc63b0c70fce42ba54eb78d88 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Wed, 17 Aug 2022 12:59:03 +0530 Subject: [PATCH 37/59] Add eye friendly green color to match highlights #450 Signed-off-by: Akhil Raj --- .../scantext/includes/license_summary_detail.html | 8 ++++---- scantext/views.py | 15 +++------------ 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/scantext/templates/scantext/includes/license_summary_detail.html b/scantext/templates/scantext/includes/license_summary_detail.html index d25e50542..0dec99a63 100644 --- a/scantext/templates/scantext/includes/license_summary_detail.html +++ b/scantext/templates/scantext/includes/license_summary_detail.html @@ -1,14 +1,14 @@

Input License Text

- -
+ {% endfor %} +
diff --git a/scantext/views.py b/scantext/views.py index 3f736b484..4f2172aa1 100644 --- a/scantext/views.py +++ b/scantext/views.py @@ -311,22 +311,13 @@ def get_highlighted_lines( tokens = tag_matched_tokens(tokens=tokens, match_qspan=match.qspan) header = """
From 4442f7c30e1122ca5783119f47ad0e3e6b997c5e Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Fri, 19 Aug 2022 00:30:13 +0530 Subject: [PATCH 38/59] Highlight all detected licenses in one match #450 Signed-off-by: Akhil Raj --- scantext/match_text.py | 7 -- .../includes/license_summary_cards.html | 2 +- .../includes/license_summary_detail.html | 119 ++++++++++-------- .../templates/scantext/license_summary.html | 9 +- scantext/views.py | 99 +++++++-------- 5 files changed, 117 insertions(+), 119 deletions(-) diff --git a/scantext/match_text.py b/scantext/match_text.py index f621872b0..910dee06a 100644 --- a/scantext/match_text.py +++ b/scantext/match_text.py @@ -174,13 +174,6 @@ def _tokenize_matched_text( # 2. to ensure the number of tokens is the same in both # tokenizers (though, of course, the case will differ as the # regular query tokenizer ignores case and punctuations). - - # NOTE: we have a rare Unicode bug/issue because of some Unicode - # codepoint such as some Turkish characters that decompose to - # char + punct when casefolded. 
This should be fixed in Unicode - # release 14 and up and likely implemented in Python 3.10 and up - # See https://github.com/nexB/scancode-toolkit/issues/1872 - # See also: https://bugs.python.org/issue34723#msg359514 qtokenized = list(index_tokenizer(token_str)) if not qtokenized: diff --git a/scantext/templates/scantext/includes/license_summary_cards.html b/scantext/templates/scantext/includes/license_summary_cards.html index f80c3450e..bdf6e3943 100644 --- a/scantext/templates/scantext/includes/license_summary_cards.html +++ b/scantext/templates/scantext/includes/license_summary_cards.html @@ -51,7 +51,7 @@ {{ license.name }} - Lines + Line(s) {% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %} diff --git a/scantext/templates/scantext/includes/license_summary_detail.html b/scantext/templates/scantext/includes/license_summary_detail.html index 0dec99a63..245a663af 100644 --- a/scantext/templates/scantext/includes/license_summary_detail.html +++ b/scantext/templates/scantext/includes/license_summary_detail.html @@ -1,60 +1,79 @@

Input License Text

- {% for text in detected_licenses.complete_text_in_array %} -
- {{ text|safe }} -
- {% endfor %} - - +
-
-
-

Detected Licenses

-
+

Detected Licenses

+
{% for license in detected_licenses.licenses %} -

- {% for rule in license.rules %} - {{ rule }} - {% endfor %} -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Key{{ license.key }}
Score{{ license.score }}
Name{{ license.name }}
Lines{% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %}
Owner{{ license.owner }}
Category{{ license.category }}
Reference{{ license.reference_url }}
+
+
+
+ {% if license.homepage_url %} + {{ license.short_name }} {% else %} {{ license.short_name }} {% endif %} +
+
+

+ {% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %} +

+

{{ license.score }}

+

+ + + +

+
+
+ +
{% endfor %} -
+
diff --git a/scantext/templates/scantext/license_summary.html b/scantext/templates/scantext/license_summary.html index a6fc98e5d..1b05f3089 100644 --- a/scantext/templates/scantext/license_summary.html +++ b/scantext/templates/scantext/license_summary.html @@ -13,11 +13,10 @@

License Detection Summary

- @@ -25,11 +24,11 @@

License Detection Summary

{% include 'scantext/includes/license_summary_header.html' with detected_licenses=detected_licenses %}
-
- {% endfor %} + +
+ +
+
\ No newline at end of file diff --git a/scantext/templates/scantext/includes/license_report.html b/scantext/templates/scantext/includes/license_report.html index 4d7fa8708..9c11410c2 100644 --- a/scantext/templates/scantext/includes/license_report.html +++ b/scantext/templates/scantext/includes/license_report.html @@ -1,3 +1,2 @@ - - Report - \ No newline at end of file +Report on Github \ No newline at end of file diff --git a/scantext/templates/scantext/includes/license_summary_chart.html b/scantext/templates/scantext/includes/license_summary_chart.html deleted file mode 100644 index 13ddf0a54..000000000 --- a/scantext/templates/scantext/includes/license_summary_chart.html +++ /dev/null @@ -1,10 +0,0 @@ -
-
-

Detected License Expressions

-
-
-
-

License Expressions Scores

-
-
-
\ No newline at end of file diff --git a/scantext/templates/scantext/includes/license_summary_detail.html b/scantext/templates/scantext/includes/license_summary_detail.html index b54ee6025..7587b1389 100644 --- a/scantext/templates/scantext/includes/license_summary_detail.html +++ b/scantext/templates/scantext/includes/license_summary_detail.html @@ -1,62 +1,57 @@
-
-

Input Text

-
{% for token in detected_licenses.license_tokens %}{{ token.value }}{% endfor %}
-
-
+

Detected Licenses

-
- {% for license in detected_licenses.license_matches %} -
-
-
{{ license.license_expression }}
-
-

{% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %}

-

{{ license.score }}

-

+

+ {% for license in detected_licenses.license_matches %} +
+
{{ license.license_expression }}
+
+

{{ license.score }}

+ -
-
-
+
+

Input Text

+
{% for token in detected_licenses.license_tokens %}{{ token.value }}{% endfor %}
+
+
\ No newline at end of file diff --git a/scantext/templates/scantext/includes/license_summary_editor.html b/scantext/templates/scantext/includes/license_summary_editor.html deleted file mode 100644 index c7752c6fa..000000000 --- a/scantext/templates/scantext/includes/license_summary_editor.html +++ /dev/null @@ -1,67 +0,0 @@ -
-
-

Input Text

-
{{ text }}
-
-
-

Detected Licenses

-
- {% for license in detected_licenses.license_matches %} -
-
-
{{ license.license_expression }}
-
-

{% if license.start_line == license.end_line %} Line {{ license.start_line }} {% else %} Lines {{ license.start_line }} - {{ license.end_line }} {% endif %}

-

{{ license.score }}

-

- - - -

-
-
- -
- {% endfor %} -
-
-
\ No newline at end of file diff --git a/scantext/templates/scantext/includes/license_summary_header.html b/scantext/templates/scantext/includes/license_summary_header.html index 916c5d013..b9f83fa87 100644 --- a/scantext/templates/scantext/includes/license_summary_header.html +++ b/scantext/templates/scantext/includes/license_summary_header.html @@ -12,9 +12,6 @@

License Expressions

- - - {{ detected_licenses.license_matches|length }}

@@ -27,4 +24,4 @@

- + \ No newline at end of file diff --git a/scantext/templates/scantext/license_scan_form.html b/scantext/templates/scantext/license_scan_form.html index 0b62ef259..0c55122d8 100644 --- a/scantext/templates/scantext/license_scan_form.html +++ b/scantext/templates/scantext/license_scan_form.html @@ -52,4 +52,4 @@

Scan License

} -{% endblock %} +{% endblock %} \ No newline at end of file diff --git a/scantext/templates/scantext/license_summary.html b/scantext/templates/scantext/license_summary.html index f538b1e96..650065694 100644 --- a/scantext/templates/scantext/license_summary.html +++ b/scantext/templates/scantext/license_summary.html @@ -4,6 +4,15 @@ {% block extrahead %} {% endblock %} @@ -11,130 +20,37 @@
{% include 'scanpipe/includes/navbar_header.html' %} -
-
-

License Detection Summary

- New Scan -
-
- - {% include 'scantext/includes/license_summary_header.html' with detected_licenses=detected_licenses %}
- {% include 'scantext/includes/license_summary_editor.html' with detected_licenses=detected_licenses %} -
- - -
{% endblock %} {% block scripts %} - - -{% endblock %} +{% endblock %} \ No newline at end of file From 5c883673e58654624cd178b539708942ae404a18 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Wed, 7 Sep 2022 20:22:42 +0530 Subject: [PATCH 47/59] Remove unused tests and validate code format #450 Signed-off-by: Akhil Raj --- scantext/tests/data/licenses | 5 - .../tests/data/matched_text/binary_text/gosu | Bin 1712 -> 0 bytes .../binary_text/rules/gpl-3.0_rdesc_1.RULE | 1 - .../binary_text/rules/gpl-3.0_rdesc_1.yml | 2 - .../tests/data/matched_text/ffmpeg/ffmpeg | Bin 6136 -> 0 bytes .../tests/data/matched_text/ffmpeg/ffmpeg.exe | Bin 16136 -> 0 bytes .../data/matched_text/ffmpeg/libavsample.lib | Bin 1783 -> 0 bytes .../index/rules/gpl-2.0_bare_single_word.RULE | 1 - .../index/rules/gpl-2.0_bare_single_word.yml | 3 - .../index/rules/gpl-2.0_or_apache-2.0_2.RULE | 3 - .../index/rules/gpl-2.0_or_apache-2.0_2.yml | 6 - .../matched_text/index/rules/mit_101.RULE | 2 - .../data/matched_text/index/rules/mit_101.yml | 5 - scantext/tests/data/matched_text/query.txt | 4 - .../tests/data/matched_text/spdx/query.txt | 12 - .../tokenize_matched_text_query.txt | 1 - .../data/matched_text/turkish_unicode/query | 20 - .../turkish_unicode/rules/rule1.RULE | 1 - .../turkish_unicode/rules/rule1.yml | 1 - .../turkish_unicode/rules/rule2.RULE | 2 - .../turkish_unicode/rules/rule2.yml | 1 - .../turkish_unicode/rules/rule3.RULE | 1 - .../turkish_unicode/rules/rule3.yml | 1 - .../turkish_unicode/rules/rule4.RULE | 1 - .../turkish_unicode/rules/rule4.yml | 1 - .../data/matched_text/unicode_text/main3.js | 1 - scantext/tests/test_match_text.py | 1490 ----------------- scantext/tests/test_views.py | 6 +- scantext/views.py | 8 +- 29 files changed, 10 insertions(+), 1569 deletions(-) delete mode 100644 scantext/tests/data/licenses delete mode 100644 scantext/tests/data/matched_text/binary_text/gosu delete mode 100644 scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.RULE delete 
mode 100644 scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.yml delete mode 100644 scantext/tests/data/matched_text/ffmpeg/ffmpeg delete mode 100644 scantext/tests/data/matched_text/ffmpeg/ffmpeg.exe delete mode 100644 scantext/tests/data/matched_text/ffmpeg/libavsample.lib delete mode 100644 scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.RULE delete mode 100644 scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.yml delete mode 100644 scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.RULE delete mode 100644 scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.yml delete mode 100644 scantext/tests/data/matched_text/index/rules/mit_101.RULE delete mode 100644 scantext/tests/data/matched_text/index/rules/mit_101.yml delete mode 100644 scantext/tests/data/matched_text/query.txt delete mode 100644 scantext/tests/data/matched_text/spdx/query.txt delete mode 100644 scantext/tests/data/matched_text/tokenize_matched_text_query.txt delete mode 100644 scantext/tests/data/matched_text/turkish_unicode/query delete mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule1.RULE delete mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule1.yml delete mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule2.RULE delete mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule2.yml delete mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule3.RULE delete mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule3.yml delete mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule4.RULE delete mode 100644 scantext/tests/data/matched_text/turkish_unicode/rules/rule4.yml delete mode 100644 scantext/tests/data/matched_text/unicode_text/main3.js delete mode 100644 scantext/tests/test_match_text.py diff --git a/scantext/tests/data/licenses b/scantext/tests/data/licenses deleted file mode 
100644 index ea5c84937..000000000 --- a/scantext/tests/data/licenses +++ /dev/null @@ -1,5 +0,0 @@ -Apache-2.0 - -MIT - -Lesser GPL \ No newline at end of file diff --git a/scantext/tests/data/matched_text/binary_text/gosu b/scantext/tests/data/matched_text/binary_text/gosu deleted file mode 100644 index 61b925fe68c8f0f791a440e370e9b61dfc0684ae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1712 zcmbVM%WD%s7@t108fhC8@j6gwRcJHsnJrRsX)Rc=BG!WlO_OaBOtYoC@j)rgv3S*E z4<35;;HmB1gU2GG$Mz!hk5F)CXJ*|5q2R#I?|c2e@0-WY?)=ii;J^UzV-U;%qn+%| zr>3%VeEAy!%0Ph;J&%DQa9R6@_2=WtPV%_tcpgE~CNv!L?T=f1f4rOu9&6SNJepWD zOu9>{%m!%K4%T?55s%X8ShcO%t3=6Pm~V0psEFIRi8yJ8o(7Ytdu zOaJI+QcHQ(df|Mn+5XxDbMHsICq>w!0npXD+rb`B`QTQnzwNQwT6xn`t>o*U9T*D$XO3UHo~qdd zGsy}S4SD|Jq2(_g+WH@iGdKG8{e|}B`R7NLe|}{7Ich5Qr30NlJ~*~~aBTVAch`DS zz#a{NDaX(DFuSJ{X=H!AJJu|VIUUoJ=R?Pb_TE#(5Jv(@q)>nn#+YD=8Rl4Ei4_hA zBA5_D2_u{cB8egaMHEv)DP@#XK_yi*V2EKx7-fucCYWT31srkA38$QK&IOlT@jxJf z1rby*!G#b~C=o~`u_Tg8Cb<++N+kn@6jnr0#S~XUNu^W}=oSNAKG3NHO$)TC@FcF( z!&$h!4QC#Qt#!Dw13PiJH63q;RalKSHY)8Jd{)`2$IlB8!m!>On=~u!s14UDal^RH z$fs&YYf{b*dX$+jM!l_q{sIt+7lasJnoB9_EBZOwL_7)2HWG}<5xEZx=11nb1 z{r1GPn$@r!>jm9kUYe%x_Ijt)f=T!~ft3U{l4LWUEtTrcq|sTM(W@)j{!8^J?%XK= zu=>^fumEfxFIfEbAjOkUODV!{RzLe;vH0ETAFaRW;x-+gtXjLj*8Yk1&w(zP4u8Ey Woj&H!KRf=B@X!5K=P!S@{enMJslW9A diff --git a/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.RULE b/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.RULE deleted file mode 100644 index 3c0984a8e..000000000 --- a/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.RULE +++ /dev/null @@ -1 +0,0 @@ -License: GPL-3 diff --git a/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.yml b/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.yml deleted file mode 100644 index 8f2188c97..000000000 --- a/scantext/tests/data/matched_text/binary_text/rules/gpl-3.0_rdesc_1.yml +++ /dev/null @@ -1,2 +0,0 @@ 
-license_expression: gpl-3.0 -is_license_tag: yes diff --git a/scantext/tests/data/matched_text/ffmpeg/ffmpeg b/scantext/tests/data/matched_text/ffmpeg/ffmpeg deleted file mode 100644 index c06345a809dba9dab8c95e37505411701fdd65c7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6136 zcmeHL-EQN!71m6D(E1g+7sM9sWU!HJXR<+d23d@g*fRza+pwM44vHe6B+6z-5>-)h z?7^b@4t8%_p;w1rP#`5+7}oFvBbl}!^1=Jd>{G4;A;5v$rGo3o;u$; zRQmTnUK^49b)190|H72%=e2X{{1(4oIKOj#jqsb0Wx9U<#?(3vS^Fja zPoG-B>C@Ndqkey3#puI6wTJYk{&vsn>eIge`Of-Tza58}dTK|${utQ$AA`Wi>-VK= ziq20R`>xwL49|s15|uKsDMYHJO7%znjLs_xRd{#jc;gLG)M z+uHgvO0%-im1FSZ_wA7SWe08j+-i__sY(Sq(5lP>QQ50%CuPwzQ0vO{!%nl~MNyK8 zr868}PX}*qFZ+xBx`PO0;bUh!OXDy_C|>rPv{h+q_DxzM6;D%noZ&vU+I zg;LCyOAPEnm2rsP3&!JE1jcnF_R?2_90-UIXTNp2HGe1S?qL+RpT+%)>^^*O7FVvGRFbt(|^ZC^r;^ z2nyJ+k<~0xv{o=1&IM0tD6!NnCnNi9LIGL$4G9@DB8?* zu~2jFu0>9Z^{iX^Wm=S7uZJqXlyT^4gb(Vjof3!<={Q(%nVw;)LXpC4*Nvqg%G?<3 z;JdRqJ-IzOhBz%Z?lLW*BGcer1>bQQipSJ~;4BEB9&9#e%dft|mq@uE3wN2tyMkFJ z)Nv@asnta;vy3KaC&S*s{>jhakjg~*7c9KTYjv>7m7wXx{<-gvud(7dg?V2l^)@Y5 zI|&9?#Clhbd8jkKSC&54`}NZIZDOlRG7tO(|D4-h$-Rhx7!Q$&I^^5%_wAL ztefrfBFP$t9-kKsNw2Ueo2n_#vSv)R+U(6HwaSFpG+XJtnS=(my~>+$FYeUO2Dnza zFPo!TXHCm@e4X*&L!-Al;Wx*(IXd~ep|L^QZwNlF{k=J&!1BFaK;`9LdDH7HbKYD? 
zVbFW=&2hu16v0XF^z=nT_%ZqGu8bukyUpOi*QOQfJXqHdIOH-$CMXokc#e4;4rhbu zWVT=;UCUghl&RR7=aP0SAq#RAJ-`j7B9@3mvRHP?4K;F$FgZYDRkQF7m|?r@Z@d5moqtEHVCnsJX!mDUmwWJah>Skp4` zfsC*QD9)7oZ4~pRW`AG@-OX_q5Br{VH>dRV+yV8=wz&!WLGYZlvF5NJ;EtU+7Sd9{ zqRq99*Ag)IwRh}w*};uLU0ZTbEC%=f);saK$6oK~IXmk1PP!-EZ%o6swmBXcLi6_$ zyY@4rTN1Db_>V)!XGs4H=|K6F2JA`(01#lKW*JE|9n$=5 z%cNxLeO~p>h6+|qGJVAaun`?p2O@S&^I9!C z%-v{0Cc!TN2PF5$4f;%h@Ngj~ZRj&%Ym z{(ont#bX7)H*BP>UojaGt{*ZYV{k~BXbtygsk1D%?Ch5*(H&@O96UfC`ipLz|;n) zz}?LQvFp-31K@^O;$_VOFX_DbK+_vkd~lFHQt1)HtofzoNyH9`XF_UN=1xj%6VjS% z?kc>IsTid=%41^@?^G!WmEm1v&-1RR^-^|&#e?6ePy!siR-(F0v zr<3tuyqL4|(PDl(9n5a0qdyL=T4w%j1lmFU$B^Yubk)8c)`sYf%wNZAl!g+=*%BGK zI^-Z~4?4db%r9o6>0&e)*Ug;PyujpR+;y}2v(e968^`CPMtgFjzD3p5xy){b>LRsT zr=+KwE3jU1f(AkL^n}$aItT?$oc0dTW?*|2NcW@kELH?-_EvQT(5Y6hSqMeF1+dsv zxm=ml_RL<6JgZ=&K1tQ*lWOkjo7}wi{^9**b#$f-T{WrvstjIB>~-W>o-fPfUv}{S W`N6#H{q~dk!ra!GJN@d@y#5O*7h!Dx diff --git a/scantext/tests/data/matched_text/ffmpeg/ffmpeg.exe b/scantext/tests/data/matched_text/ffmpeg/ffmpeg.exe deleted file mode 100644 index 5a9b37bdb1d6e965201cbd5d028ae82b9333203f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16136 zcmeI3OK%)kcE>Me1BrX$L4bG>WRT32koJ%cyGcp5<4_jIG%3mi^dLyemZK4*>Z;q_ zWz|dd=xz$eAd9R51n?psAxN^w#=syLWHn!4fP8~EK$da7K#(ZEb8l5YD2-^(x|{5- zs{4Na&;Oiz>h@oLrCw7?z0UXfbESUCRX!it-&g!)8|XlnaL zZM^n_l?y*o|M!n8>e7!csK5C;_03zv{yty*Pb;={a!(FvHv#s_MX7(I90LFLFaG$# zYw9DSw96-%B@dta;vODywW2q`1{w@PF1Fzz?8-> zO`wY~c~a8R|bL44-9h!^6+s5jSLie<}XQ8idd6lYf>?@U2y#z3XH>3DIT zEVgVuYW!l<@XGt|E)*~6zsSsDooQArj!9m|*+RA=@v>}jkcxCNH}3SN$c&krT|&yC zN%O^lE63>z-5Z5LQF@DwAEnaSac^OTP~<4hdm!0#(MeFYSnJpuWuE_JA46>G0J5asQulTBdK&ULMK@++q{ARG%HKA~Q0P zI!H!dL^E@frFp3(M%$Q{x|cjj(s81nm{a|ecJULz{ibl)-KxF$ZqVMmV!xf{My~2^ zRqC{_V-u(OsqR%_6okoOp^}6(+fw8C_RCUh(p+j6mt13kASOE({1ZT;PFt7#v8Y6a zTT-k2X&t$ajPtN8DUM;0qCmm4iUKWC%!{Jbr&0aB$$`fk?34f`QhTPp*;ie4tnRCi z)!t%HES3rM>QB4!(bx5-VPMjh-f5n9svzX~xOqOVdSzHfrlq_4b(Vc3s_EE4)^QaP 
zJ(}rW~e$zT9F($_Lh&8AW(L&I{6D$}GGds3br4?}+_osl^jGbk*#c3*e+ zvEx21KTJw6%1Oal@f5*aHfb$PWop$H)pBLIz zU@LJ8rdcnF%=|xS7Y`A7n$Jy>nB0qCO_|pVqp%E3v89zAe;IzQg8Z^pC0xqq-b6?f zRm7}IE>*8D>!Z+$N!7|9iU8A!z3s*R&`SnJt7bP-hFv-&PSV8K%1@fbbhdh1td{Bv zSzF3JD|hcfGi>g9Ik4AkG_stgGwHzwKt$J?^+JLyoR~-kD)vg7h9$9(pzwr-=%$R9 z#Z7aweik!JR};!-Y|=`~7lCHCDNEkn+)1@tfdVDX{8`4sW|xiOK&#LW#z2Do{TNBM zZWm3G%B;yM3W0>&Xm13N3PyJ8bUF?P!%`=P&XQTkFIC5xr9-knwHE|TZkjOHP(YK* z4WOL^wx$4D&a^`owiGc)!e^A(JFP1s%5nRk<&qQf5D5ym`zate%rF9iegWc>kgV?rnbzJ$r zDT;m-(KaheA-yk*0i_0}R}EC(U6eYa#fw$QVo6yZZgt9zi>nEU$XYhJ<=$f<;6nwA z8OzU59v{&cwvs$Iw`$LVGVQ7CloaE__vmLJP%qTE*l?Me8EW9OaHwvAbO#%L?J6G#&AW ztOE<}G85;oT}ic=&q{&WT9?vEG@quPR$H%KiHj3k=U6Ov=hJ%i+FUQs;6_6h!Q9Ng zBK`Mh(3yNo0Uk&4fK{^g1 z{HUuBQ=kR(bL_9MJJv~ZSusd9N28urp(&9@i4aos)JPdwIcn_X35`}^(oYrpwU-tK z3Fjebnu=&=mUgx+jn!}K)y>Jfn|$<_dUNuD+^wmRay(QuJQD=9>kU{<42}316y#?U z*xp19VP3*!khRl3#opwRqj$DbcHMQ@^n$!|O0gS9aOCc>5Lt(OwKmGL#T#~)zS_Og zy&`K@@21&lZpFc>zlJ#Mw2=vH2PaWnxi?ii4yzsnU*DqT2~r;soJok4553ctg)OQ5xUOCAi+X1D zo@?5`YpI|HLw&WWpe}74Sw~@ss&!bH(H{=WGTYkN7>~zYOqAU;A8d%)#*7vl+YklhrMiLOM`F9>ijKtE zYUw@E{Pzr$Q%J;=38M4gAma85!A`TsQj2PT`90BPTd20O?kVv=-4gj{^(nD??1^o` zRlsI>p@KTnnpDn4cEx5w3#`S2A!1L99b%^|6{!FEgKt;V?<)0G|0DI^a?j^~>M&h= zr2d!t-+AlXl{eq|*4n*L{%Z1PwZFu8xz%cfT!|7VYKA58Hq>qK$&GX<$EL=7yFHyOZVUyP7r@7z9YR!X5i#@0oQYvBm27IUcXdLrlv6H1 zvM$kPo3Cy@yeWGwtflUN6k@h>l}M7>d?5xUSOFRV%Db5LkqfKsU=85*!`!bT;HoYO zOBa%{Ipb?7)aMOBzZXBA=c~g!MO<2~wZMvWy)7y}hZm+SxStjSir0zc7>F2mn&4L|4Lujp)ZMhHDIUR-Q%DmvJ+C~re#7P8Eq5Yluo6a%ic_DC{k z>~&d5*s^&UlUIb)2ahc!M^frwkYK*1Kn57GpCc&~MmUT;4jAfE>@h)!ZKO6Cf0sO% zyQeu$#2^a~q82x#BEw~rrEAEGb`;TaE4dzFtuBq6yJEVljj(D8k4s509ST<5QjuFK zat()ixgrN%%pudclHwvJ($4moeW>4lTe}EHhi5x<76THMsh>u#jO#_fsomMLlhvR? 
z*8m!fwy_)2-QQCC7EP^%2}GJk)MQlcl%;1ap;_pc6;gpjQ_#~xl(Votffb-x*I|uw z;5B{B&C|+Y4|Wjx;dQ+!Qeh9lk^!DTdtPO7Fj~XaYF+Ym0Bfx^wTjiDA*$oVf-hyv z`UaZIfv9xORpVrVJsO#|bXM5Cll{&IqRTOraCjiRKHuKiPCGd=+}(Br7ogsmy^ymr*!tvq_gene1s_KY>WGg{_;x>WL}Yo49J+osLGvEJJ%ID zwrh`mHYe>#V+^nLPBHAWw2BfLj1}=gJ9sFAIYp`^XitY?3EE3~|DEZ*#o==?dtEm> zRpG-@rWy1%hdpsnR;F?NAi)k)Y@}W-#vB!zU`^=a0FzsRfu+gv@`TQsk8^^!C9Df^ z%ME@Wpbr#O4%XY^xd1!f;M4(UlKWzzS0w9YK5c9l_GhDA@NaL44<>C6zoGA3*OY#6 zjPQRUW*@ALePSPVo+st4~N&#Y8CUy?{A)NKSpWr}& zwnw+ZKy}kIph@VdsKD;LSN-bw^XLD*11{YZpc21fcG2woV`FxUA2DFl+0G&_!MuS- zahNnu=U5hOzFt-|nYeXj`9p|Zg)>)OTJtPznbMUdbDIvulDRFJ+jnGcvyFQ$oTYxG z(xT5dS0VQy_aXPCT=8Q9k5t8Xrc{cnnS(BI60%AxM97&*YrjCJrbDDePpg$+OHyX( zfd%meJ1@f&l&p8uMq}6%g?iufODxRWD9*2bsbNz&bL^F4vG+;(I865KsqZwzrIf@!G~$F_8@$yw?(5r8~CB7L!YS^ z!v9Qvp*YI`9IQH`&s|;Z_`P6~%+CYu$%^@%Wcl`x*zlDZg1#z3EUasos+6eMhS2eBaKc zc{Pu6si!pR;wATnm~pR^+5^!`?9JIn-gT81kDIT=$UX1$E_}tQN5Yz`eZ{!(ffE)e zZyt#URUfU^9^_k(EZha|kHS6YGNf05&F&@#>G)sau-^EvO&!g^DLGZ)xOPBUVj2^j zmdB9ypbGL}2o&hy6hPCK-{boYDz03ATtELu{ro39|A4Pt`K+w0*rvOu@BMp#x#I`i=MbM%+v-VAxqgeDHceH(gXTW+3-CR7gPjNtluDMh2#vUq^&bg$` ztbv`;$O;pD)ZP)FGAYk$`g8AT;AWktkXCs^0N@rM!=ad36CA%wy!PbXIBbqBjb(sS z8fzn%wlPiCt-~TGJ>-%>minI1K=;01aaxon?vnY0u6cEEhgXS8IbxDG%EjBw<;C+a z(6>!2u^+roo>g&qqa+v5rPei4@3>*JXZ4)Q%j-L-e?!xSC zbl0lTn>IqNj|iP%gwDVG7irzTK&{F5f0K6>wJtvVR_-E%KDQ8BLPvKHnt)tB??G8Y zx8PlAW$5c1xx9d!FTPDog)VtN&Cw<2*&len&5;kcqC>5vi*ja;C($?)GWM2p7bQt* z_#|?cMFZOF(P2V1*ZTdzaAW;mca=#_RV2mO%`A5`HIhfQy2{XwbKH8N74A5kP$7AY zsRv`L6Sol|#_~iI+YxK5ZzwyaldIK_is69Fgo^dd5$CBFjHEHidxJhE#6#h1N`=}^ zUCzM46&*`Xs7q_(q_Se+=CN1@z?hG-Ne#7`@*?GgnJng>GY_H5yy`R>4kPJF(_!_P z*K{;zI_AN0G>>^MSOb!1o5HcwwdaYDp4))fXy=U$>#h&FPINQJs4TfJOQyMSC(2I) z*Z34}EU*RBDi3UkCh;+$w5EM!Bv^ii;5Z(-bvY?x(E-m^cc>IW}=^RZNm)!EZ24{%Jmjq zRl3Zzycx~Qj%|_^{o&vz&w5R?xC35zO9@LPow)54GTPmF{pje`)4i96qodvJ?g}Kp zL=w26L`+BpBa0y@$-;DUFsfZeu>c>8P`TZ!0mTxUPdUDY_n$o9$2qsIoX`Gh4{M7h zg|*!}@@M?K8| diff --git a/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.RULE 
b/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.RULE deleted file mode 100644 index c0e32dd8e..000000000 --- a/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.RULE +++ /dev/null @@ -1 +0,0 @@ -GPLv2 \ No newline at end of file diff --git a/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.yml b/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.yml deleted file mode 100644 index d78d0c44d..000000000 --- a/scantext/tests/data/matched_text/index/rules/gpl-2.0_bare_single_word.yml +++ /dev/null @@ -1,3 +0,0 @@ -license_expression: gpl-2.0 -is_license_reference: yes -relevance: 80 \ No newline at end of file diff --git a/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.RULE b/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.RULE deleted file mode 100644 index 995ec316a..000000000 --- a/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.RULE +++ /dev/null @@ -1,3 +0,0 @@ -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
\ No newline at end of file diff --git a/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.yml b/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.yml deleted file mode 100644 index 41746474c..000000000 --- a/scantext/tests/data/matched_text/index/rules/gpl-2.0_or_apache-2.0_2.yml +++ /dev/null @@ -1,6 +0,0 @@ -license_expression: gpl-2.0 OR apache-2.0 -is_license_notice: yes -referenced_filenames: - - COPYING - - LICENSE.Apache -notes: seen in RocksDB diff --git a/scantext/tests/data/matched_text/index/rules/mit_101.RULE b/scantext/tests/data/matched_text/index/rules/mit_101.RULE deleted file mode 100644 index 722e438fd..000000000 --- a/scantext/tests/data/matched_text/index/rules/mit_101.RULE +++ /dev/null @@ -1,2 +0,0 @@ -This source code is licensed under the MIT license found in the -LICENSE file in the root directory of this source tree. diff --git a/scantext/tests/data/matched_text/index/rules/mit_101.yml b/scantext/tests/data/matched_text/index/rules/mit_101.yml deleted file mode 100644 index ca1a71366..000000000 --- a/scantext/tests/data/matched_text/index/rules/mit_101.yml +++ /dev/null @@ -1,5 +0,0 @@ -license_expression: mit -is_license_notice: yes -relevance: 100 -referenced_filenames: - - LICENSE \ No newline at end of file diff --git a/scantext/tests/data/matched_text/query.txt b/scantext/tests/data/matched_text/query.txt deleted file mode 100644 index d5dc1521d..000000000 --- a/scantext/tests/data/matched_text/query.txt +++ /dev/null @@ -1,4 +0,0 @@ -# This source code is licensed under both the Apache 2.0 license (found in the -# LICENSE file in the root directory of this source tree) and the GPLv2 (found -# in the COPYING file in the root directory of this source tree). 
-# You may select, at your option, one of the above-listed licenses diff --git a/scantext/tests/data/matched_text/spdx/query.txt b/scantext/tests/data/matched_text/spdx/query.txt deleted file mode 100644 index 0ef045154..000000000 --- a/scantext/tests/data/matched_text/spdx/query.txt +++ /dev/null @@ -1,12 +0,0 @@ -@REM ## @file -@REM # Makefile -@REM # -@REM # Copyright (c) 2007 - 2018, Intel Corporation. All rights reserved.
-@REM # SPDX-License-Identifier: BSD-2-Clause-Patent -@REM # - -@echo off -setlocal -set TOOL_ERROR=0 -SET NMAKE_COMMAND=%1 -SHIFT diff --git a/scantext/tests/data/matched_text/tokenize_matched_text_query.txt b/scantext/tests/data/matched_text/tokenize_matched_text_query.txt deleted file mode 100644 index f4d5c8efa..000000000 --- a/scantext/tests/data/matched_text/tokenize_matched_text_query.txt +++ /dev/null @@ -1 +0,0 @@ -the MODULE_LICENSE_GPL+ foobar \ No newline at end of file diff --git a/scantext/tests/data/matched_text/turkish_unicode/query b/scantext/tests/data/matched_text/turkish_unicode/query deleted file mode 100644 index 19adb4ef5..000000000 --- a/scantext/tests/data/matched_text/turkish_unicode/query +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed under the Apache License, Version 2.0 -next_label=İrəli - -Some stuff here -İ license MIT - -next_label=İrəli - - -İ license MIT - -Some stuff here -Some more stuff here - -# Licensed under the Apache License, Version 2.0 -next_label=İrəli - -lİcense MİT - -some more diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.RULE b/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.RULE deleted file mode 100644 index f0ec0e607..000000000 --- a/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.RULE +++ /dev/null @@ -1 +0,0 @@ -İ license MIT \ No newline at end of file diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.yml b/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.yml deleted file mode 100644 index 864a8c3ca..000000000 --- a/scantext/tests/data/matched_text/turkish_unicode/rules/rule1.yml +++ /dev/null @@ -1 +0,0 @@ -license_expression: mit diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.RULE b/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.RULE deleted file mode 100644 index 7ca4781d2..000000000 --- a/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.RULE +++ /dev/null @@ -1,2 +0,0 @@ 
-# Licensed under the Apache License, Version 2.0 -next_label=İrəli diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.yml b/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.yml deleted file mode 100644 index a4f80f07b..000000000 --- a/scantext/tests/data/matched_text/turkish_unicode/rules/rule2.yml +++ /dev/null @@ -1 +0,0 @@ -license_expression: apache-2.0 diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.RULE b/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.RULE deleted file mode 100644 index 7b767dbba..000000000 --- a/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.RULE +++ /dev/null @@ -1 +0,0 @@ -Licensed under the Apache License, Version 2.0 diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.yml b/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.yml deleted file mode 100644 index 1443a0848..000000000 --- a/scantext/tests/data/matched_text/turkish_unicode/rules/rule3.yml +++ /dev/null @@ -1 +0,0 @@ -license_expression: proprietary-license diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.RULE b/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.RULE deleted file mode 100644 index d00dc0e28..000000000 --- a/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.RULE +++ /dev/null @@ -1 +0,0 @@ -lİcense MİT \ No newline at end of file diff --git a/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.yml b/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.yml deleted file mode 100644 index 864a8c3ca..000000000 --- a/scantext/tests/data/matched_text/turkish_unicode/rules/rule4.yml +++ /dev/null @@ -1 +0,0 @@ -license_expression: mit diff --git a/scantext/tests/data/matched_text/unicode_text/main3.js b/scantext/tests/data/matched_text/unicode_text/main3.js deleted file mode 100644 index f0ec0e607..000000000 --- a/scantext/tests/data/matched_text/unicode_text/main3.js +++ /dev/null @@ -1 
+0,0 @@ -İ license MIT \ No newline at end of file diff --git a/scantext/tests/test_match_text.py b/scantext/tests/test_match_text.py deleted file mode 100644 index 2f46273fd..000000000 --- a/scantext/tests/test_match_text.py +++ /dev/null @@ -1,1490 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# ScanCode is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/scancode-toolkit for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# -import os - -from commoncode.testcase import FileBasedTesting -from licensedcode import cache -from licensedcode import index -from licensedcode import models -from licensedcode.spans import Span - -from scantext.match_text import Token -from scantext.match_text import get_full_matched_text -from scantext.match_text import reportable_tokens -from scantext.match_text import tokenize_matched_text - -TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") - - -class TestCollectLicenseMatchTexts(FileBasedTesting): - test_data_dir = TEST_DATA_DIR - - def test_get_full_matched_text_base(self): - rule_text = """ - Copyright [[some copyright]] - THIS IS FROM [[THE CODEHAUS]] AND CONTRIBUTORS - IN NO EVENT SHALL [[THE CODEHAUS]] OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE [[POSSIBILITY OF SUCH]] DAMAGE - """ - - rule = models.Rule(stored_text=rule_text, license_expression="test") - idx = index.LicenseIndex([rule]) - - querys = """ - foobar 45 . Copyright 2003 (C) James. All Rights Reserved. - THIS IS FROM THE CODEHAUS AND CONTRIBUTORS - IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC dasdasda . 
- """ - result = idx.match(query_string=querys) - assert len(result) == 1 - match = result[0] - - # Note that there is a trailing space in that string - expected = """Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved]. - THIS IS FROM THE CODEHAUS AND CONTRIBUTORS - IN NO EVENT SHALL THE [best] CODEHAUS OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ - matched_text = "".join( - get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False) - ) - assert matched_text == expected - - expected_nh = """Copyright 2003 (C) James. All Rights Reserved. - THIS IS FROM THE CODEHAUS AND CONTRIBUTORS - IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ - matched_text_nh = "".join( - get_full_matched_text( - match, query_string=querys, idx=idx, _usecache=False, highlight=False - ) - ) - assert matched_text_nh == expected_nh - - expected_origin_text = """Copyright 2003 (C) James. All Rights Reserved. - THIS IS FROM THE CODEHAUS AND CONTRIBUTORS - IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ - origin_matched_text = "".join( - get_full_matched_text( - match, - query_string=querys, - idx=idx, - highlight_not_matched="{}", - ) - ) - assert origin_matched_text == expected_origin_text - - def test_get_full_matched_text(self): - rule_text = """ - Copyright [[some copyright]] - THIS IS FROM [[THE CODEHAUS]] AND CONTRIBUTORS - IN NO EVENT SHALL [[THE CODEHAUS]] OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE [[POSSIBILITY OF SUCH]] DAMAGE - """ - - rule = models.Rule(stored_text=rule_text, license_expression="test") - idx = index.LicenseIndex([rule]) - - querys = """ - foobar 45 Copyright 2003 (C) James. All Rights Reserved. 
- THIS IS FROM THE CODEHAUS AND CONTRIBUTORS - IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC - """ - result = idx.match(query_string=querys) - assert len(result) == 1 - match = result[0] - - # Note that there is a trailing space in that string - expected = """Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved]. - THIS IS FROM THE CODEHAUS AND CONTRIBUTORS - IN NO EVENT SHALL THE [best] CODEHAUS OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ - - matched_text = "".join( - get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False) - ) - assert matched_text == expected - - # the text is finally rstripped - matched_text = match.matched_text(_usecache=False) - assert matched_text == expected.rstrip() - - # test again using some HTML with tags - # Note that there is a trailing space in that string - expected = """Copyright
2003
(
C
)
James
.
All

Rights

Reserved
. - THIS IS FROM THE CODEHAUS AND CONTRIBUTORS - IN NO EVENT SHALL THE
best
CODEHAUS OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ - matched_text = "".join( - get_full_matched_text( - match, - query_string=querys, - idx=idx, - highlight_not_matched="
{}
", - _usecache=False, - ) - ) - assert matched_text == expected - - # test again using whole_lines - expected = """ foobar 45 Copyright 2003 (C) James. All Rights Reserved. - THIS IS FROM THE CODEHAUS AND CONTRIBUTORS - IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC\n""" - matched_text = "".join( - get_full_matched_text( - match, - query_string=querys, - idx=idx, - highlight_not_matched="{}", - whole_lines=True, - ) - ) - assert matched_text == expected - - def test_get_full_matched_text_does_not_munge_underscore(self): - rule_text = "MODULE_LICENSE_GPL" - - rule = models.Rule(stored_text=rule_text, license_expression="test") - idx = index.LicenseIndex([rule]) - - querys = "MODULE_LICENSE_GPL" - result = idx.match(query_string=querys) - assert len(result) == 1 - match = result[0] - - expected = "MODULE_LICENSE_GPL" - matched_text = "".join( - get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False) - ) - assert matched_text == expected - - def test_get_full_matched_text_does_not_munge_plus(self): - rule_text = "MODULE_LICENSE_GPL+ +" - - rule = models.Rule(stored_text=rule_text, license_expression="test") - idx = index.LicenseIndex([rule]) - - querys = "MODULE_LICENSE_GPL+ +" - result = idx.match(query_string=querys) - assert len(result) == 1 - match = result[0] - - expected = "MODULE_LICENSE_GPL+ +\n" - matched_text = "".join( - get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False) - ) - assert matched_text == expected - - def test_tokenize_matched_text_does_cache_last_call_from_query_string_and_location( - self, - ): - dictionary = {"module": 0, "license": 1, "gpl+": 2} - location = None - query_string = "the MODULE_LICENSE_GPL+ foobar" - result1 = tokenize_matched_text(location, query_string, dictionary) - result2 = tokenize_matched_text(location, query_string, dictionary) - assert result2 is result1 - - location = 
self.get_test_loc("matched_text/tokenize_matched_text_query.txt") - query_string = None - result3 = tokenize_matched_text(location, query_string, dictionary) - assert result3 is not result2 - assert result3 == result2 - - result4 = tokenize_matched_text(location, query_string, dictionary) - assert result4 is result3 - - def test_tokenize_matched_text_does_return_correct_tokens(self): - querys = """ - foobar 45 Copyright 2003 (C) James. All Rights Reserved. THIS - IS FROM THE CODEHAUS AND CONTRIBUTORS - """ - dictionary = dict( - this=0, event=1, possibility=2, reserved=3, liable=5, copyright=6 - ) - result = tokenize_matched_text( - location=None, query_string=querys, dictionary=dictionary - ) - expected = [ - Token( - value="\n", - line_num=1, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="foobar", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="45", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Copyright", - line_num=2, - pos=0, - is_text=True, - is_matched=False, - is_known=True, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="2003", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" (", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="C", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=") ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - 
), - Token( - value="James", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=". ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="All", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Rights", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Reserved", - line_num=2, - pos=1, - is_text=True, - is_matched=False, - is_known=True, - ), - Token( - value=". ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="THIS", - line_num=2, - pos=2, - is_text=True, - is_matched=False, - is_known=True, - ), - Token( - value="\n", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=3, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="IS", - line_num=3, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=3, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="FROM", - line_num=3, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=3, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="THE", - line_num=3, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=3, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="CODEHAUS", - line_num=3, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=3, - pos=-1, - is_text=False, - 
is_matched=False, - is_known=False, - ), - Token( - value="AND", - line_num=3, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=3, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="CONTRIBUTORS", - line_num=3, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value="\n", - line_num=3, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value=" \n", - line_num=4, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - ] - - assert result == expected - - def test_tokenize_matched_text_does_not_crash_on_turkish_unicode(self): - querys = "İrəli" - result = tokenize_matched_text( - location=None, query_string=querys, dictionary={} - ) - - expected = [ - Token( - value="i", - line_num=1, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value="rəli", - line_num=1, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value="\n", - line_num=1, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - ] - assert result == expected - - def test_tokenize_matched_text_behaves_like_query_tokenizer_on_turkish_unicode( - self, - ): - from licensedcode.tokenize import query_tokenizer - - querys = "İrəli" - matched_text_result = tokenize_matched_text( - location=None, query_string=querys, dictionary={} - ) - matched_text_result = [t.value for t in matched_text_result] - query_tokenizer_result = list(query_tokenizer(querys)) - - if matched_text_result[-1] == "\n": - matched_text_result = matched_text_result[:-1] - - assert matched_text_result == query_tokenizer_result - - def test_reportable_tokens_filter_tokens_does_not_strip_last_token_value(self): - tokens = [ - Token( - value="\n", - line_num=1, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, 
- is_known=False, - ), - Token( - value="foobar", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="45", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Copyright", - line_num=2, - pos=0, - is_text=True, - is_matched=False, - is_known=True, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="2003", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" (", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="C", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=") ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="James", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=". ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="All", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Rights", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Reserved", - line_num=2, - pos=1, - is_text=True, - is_matched=False, - is_known=True, - ), - Token( - value=". 
", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="THIS", - line_num=2, - pos=2, - is_text=True, - is_matched=False, - is_known=True, - ), - Token( - value="\n", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=3, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - ] - - match_qspan = Span(0, 1) - result = list( - reportable_tokens( - tokens, match_qspan, start_line=1, end_line=2, whole_lines=False - ) - ) - expected = [ - Token( - value="Copyright", - line_num=2, - pos=0, - is_text=True, - is_matched=True, - is_known=True, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="2003", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" (", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="C", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=") ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="James", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=". ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="All", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Rights", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Reserved", - line_num=2, - pos=1, - is_text=True, - is_matched=True, - is_known=True, - ), - Token( - value=". 
", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - ] - - assert result == expected - - # test again with whole lines - match_qspan = Span(0, 1) - result = list( - reportable_tokens( - tokens, match_qspan, start_line=1, end_line=2, whole_lines=True - ) - ) - expected = [ - Token( - value="\n", - line_num=1, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="foobar", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="45", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Copyright", - line_num=2, - pos=0, - is_text=True, - is_matched=True, - is_known=True, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="2003", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" (", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="C", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=") ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="James", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=". 
", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="All", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Rights", - line_num=2, - pos=-1, - is_text=True, - is_matched=False, - is_known=False, - ), - Token( - value=" ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="Reserved", - line_num=2, - pos=1, - is_text=True, - is_matched=True, - is_known=True, - ), - Token( - value=". ", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - Token( - value="THIS", - line_num=2, - pos=2, - is_text=True, - is_matched=False, - is_known=True, - ), - Token( - value="\n", - line_num=2, - pos=-1, - is_text=False, - is_matched=False, - is_known=False, - ), - ] - - assert result == expected - - def test_matched_text_is_collected_correctly_end2end(self): - rules_data_dir = self.get_test_loc("matched_text/index/rules") - query_location = self.get_test_loc("matched_text/query.txt") - rules = models.load_rules(rules_data_dir) - idx = index.LicenseIndex(rules) - - results = [ - match.matched_text(_usecache=False) - for match in idx.match(location=query_location) - ] - expected = [ - "This source code is licensed under both the Apache 2.0 license " - "(found in the\n# LICENSE", - "This source code is licensed under [both] [the] [Apache] [2].[0] license " - "(found in the\n# LICENSE file in the root directory of this source tree)", - "GPLv2 (", - ] - assert results == expected - - def check_matched_texts(self, test_loc, expected_texts, whole_lines=True): - idx = cache.get_index() - test_loc = self.get_test_loc(test_loc) - matches = idx.match(location=test_loc) - matched_texts = [ - m.matched_text(whole_lines=whole_lines, highlight=False, _usecache=False) - for m in matches - ] - assert 
matched_texts == expected_texts - - def test_matched_text_is_collected_correctly_end2end_for_spdx_match_whole_lines( - self, - ): - self.check_matched_texts( - test_loc="matched_text/spdx/query.txt", - expected_texts=["@REM # SPDX-License-Identifier: BSD-2-Clause-Patent"], - whole_lines=True, - ) - - def test_matched_text_is_collected_correctly_end2end_for_spdx_match_plain(self): - self.check_matched_texts( - test_loc="matched_text/spdx/query.txt", - expected_texts=["SPDX-License-Identifier: BSD-2-Clause-Patent"], - whole_lines=False, - ) - - def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_query( - self, - ): - idx = cache.get_index() - querys_with_diacritic_unicode = "İ license MIT" - result = idx.match(query_string=querys_with_diacritic_unicode) - assert len(result) == 1 - match = result[0] - expected = "license MIT" - matched_text = match.matched_text( - _usecache=False, - ) - assert matched_text == expected - - def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_file(self): - idx = cache.get_index() - file_with_diacritic_unicode_location = self.get_test_loc( - "matched_text/unicode_text/main3.js" - ) - result = idx.match(location=file_with_diacritic_unicode_location) - assert len(result) == 1 - match = result[0] - expected = "license MIT" - matched_text = match.matched_text(_usecache=False) - assert matched_text == expected - - def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_query_whole_lines( - self, - ): - idx = cache.get_index() - querys_with_diacritic_unicode = "İ license MIT" - result = idx.match(query_string=querys_with_diacritic_unicode) - assert len(result) == 1 - match = result[0] - expected = "[İ] license MIT" - matched_text = match.matched_text(_usecache=False, whole_lines=True) - assert matched_text == expected - - def test_matched_text_is_not_truncated_with_unicode_diacritic_input_with_diacritic_in_rules( - self, - ): - rule_dir = 
self.get_test_loc("matched_text/turkish_unicode/rules") - idx = index.LicenseIndex(models.load_rules(rule_dir)) - query_loc = self.get_test_loc("matched_text/turkish_unicode/query") - matches = idx.match(location=query_loc) - matched_texts = [ - m.matched_text(whole_lines=False, highlight=False, _usecache=False) - for m in matches - ] - - expected = [ - "Licensed under the Apache License, Version 2.0\r\nnext_label=irəli", - "İ license MIT", - "İ license MIT", - "Licensed under the Apache License, Version 2.0\r\nnext_label=irəli", - "lİcense mit", - ] - - assert matched_texts == expected - - def test_matched_text_is_not_truncated_with_unicode_diacritic_input_and_full_index( - self, - ): - expected = [ - "Licensed under the Apache License, Version 2.0", - "license MIT", - "license MIT", - "Licensed under the Apache License, Version 2.0", - ] - - self.check_matched_texts( - test_loc="matched_text/turkish_unicode/query", - expected_texts=expected, - whole_lines=False, - ) - - def test_matched_text_does_not_ignores_whole_lines_in_binary_with_small_index(self): - rule_dir = self.get_test_loc("matched_text/binary_text/rules") - idx = index.LicenseIndex(models.load_rules(rule_dir)) - query_loc = self.get_test_loc("matched_text/binary_text/gosu") - matches = idx.match(location=query_loc) - matched_texts = [ - m.matched_text(whole_lines=True, highlight=False, _usecache=False) - for m in matches - ] - - expected = [ - "{{ .Self }} license: GPL-3 (full text at https://github.com/tianon/gosu)" - ] - - assert matched_texts == expected - - def test_matched_text_does_not_ignores_whole_lines_in_binary_against_full_index( - self, - ): - expected = [ - "{{ .Self }} license: GPL-3 (full text at https://github.com/tianon/gosu)" - ] - self.check_matched_texts( - test_loc="matched_text/binary_text/gosu", - expected_texts=expected, - whole_lines=True, - ) - - def test_matched_text_is_collected_correctly_in_binary_ffmpeg_windows_whole_lines( - self, - ): - expected_texts = [ - 
"--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " - "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " - "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " - "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " - "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " - "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " - "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " - "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " - "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " - "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " - "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " - "--enable-lzma --enable-decklink --enable-zlib", - "%sconfiguration: --enable-gpl --enable-version3 --enable-dxva2 " - "--enable-libmfx --enable-nvenc --enable-avisynth --enable-bzlib " - "--enable-fontconfig --enable-frei0r --enable-gnutls --enable-iconv " - "--enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca " - "--enable-libfreetype --enable-libgme --enable-libgsm --enable-libilbc " - "--enable-libmodplug --enable-libmp3lame --enable-libopencore-amrnb " - "--enable-libopencore-amrwb --enable-libopenh264 --enable-libopenjpeg " - "--enable-libopus --enable-librtmp --enable-libsnappy --enable-libsoxr " - "--enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab " - "--enable-libvo-amrwbenc --enable-libvorbis --enable-libvpx " - "--enable-libwavpack --enable-libwebp --enable-libx264 --enable-libx265 " - "--enable-libxavs --enable-libxvid --enable-libzimg --enable-lzma " - "--enable-decklink --enable-zlib", - "%s is free software; you can redistribute it and/or modify\n" - "it under the terms of the GNU General Public License as published by\n" - "the Free Software Foundation; either version 3 of the License, 
or\n" - "(at your option) any later version.\n" - "%s is distributed in the hope that it will be useful,\n" - "but WITHOUT ANY WARRANTY; without even the implied warranty of\n" - "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n" - "GNU General Public License for more details.\n" - "You should have received a copy of the GNU General Public License\n" - "along with %s. If not, see .\n" - "File formats:\n" - "D. = Demuxing supported\n" - ".E = Muxing supported\n" - "%s%s %-15s %s\n" - "Devices:\n" - "Codecs:\n" - "D..... = Decoding supported\n" - ".E.... = Encoding supported\n" - "..V... = Video codec\n" - "No option name near '%s'\n" - "Unable to parse '%s': %s\n" - "Setting '%s' to value '%s'\n" - "Option '%s' not found\n" - "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " - "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " - "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " - "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " - "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " - "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " - "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " - "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " - "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " - "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " - "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " - "--enable-lzma --enable-decklink --enable-zlib", - "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " - "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " - "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " - "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " - "--enable-libgsm --enable-libilbc --enable-libmodplug 
--enable-libmp3lame " - "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " - "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " - "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " - "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " - "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " - "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " - "--enable-lzma --enable-decklink --enable-zlib", - "libavfilter license: GPL version 3 or later", - "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " - "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " - "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " - "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " - "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " - "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " - "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " - "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " - "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " - "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " - "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " - "--enable-lzma --enable-decklink --enable-zlib", - "libavformat license: GPL version 3 or later", - "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " - "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " - "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " - "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " - "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " - "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " - "--enable-libopenjpeg 
--enable-libopus --enable-librtmp --enable-libsnappy " - "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " - "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " - "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " - "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " - "--enable-lzma --enable-decklink --enable-zlib", - "libavcodec license: GPL version 3 or later", - "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " - "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " - "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " - "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " - "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " - "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " - "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " - "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " - "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " - "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " - "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " - "--enable-lzma --enable-decklink --enable-zlib", - "libpostproc license: GPL version 3 or later", - "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " - "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " - "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " - "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " - "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " - "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " - "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " - "--enable-libsoxr --enable-libspeex --enable-libtheora 
--enable-libtwolame " - "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " - "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " - "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " - "--enable-lzma --enable-decklink --enable-zlib", - "libswresample license: GPL version 3 or later", - "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " - "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " - "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " - "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " - "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " - "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " - "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " - "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " - "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " - "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 " - "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " - "--enable-lzma --enable-decklink --enable-zlib", - "libswscale license: GPL version 3 or later", - "--enable-gpl --enable-version3 --enable-dxva2 --enable-libmfx --enable-nvenc " - "--enable-avisynth --enable-bzlib --enable-fontconfig --enable-frei0r " - "--enable-gnutls --enable-iconv --enable-libass --enable-libbluray " - "--enable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme " - "--enable-libgsm --enable-libilbc --enable-libmodplug --enable-libmp3lame " - "--enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenh264 " - "--enable-libopenjpeg --enable-libopus --enable-librtmp --enable-libsnappy " - "--enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame " - "--enable-libvidstab --enable-libvo-amrwbenc --enable-libvorbis " - "--enable-libvpx --enable-libwavpack 
--enable-libwebp --enable-libx264 " - "--enable-libx265 --enable-libxavs --enable-libxvid --enable-libzimg " - "--enable-lzma --enable-decklink --enable-zlib", - "libavutil license: GPL version 3 or later", - "This software is derived from the GNU GPL XviD codec (1.3.0).", - ] - - self.check_matched_texts( - test_loc="matched_text/ffmpeg/ffmpeg.exe", - expected_texts=expected_texts, - whole_lines=True, - ) - - def test_matched_text_is_collected_correctly_in_binary_ffmpeg_windows_not_whole_lines( - self, - ): - expected_texts = [ - "enable-gpl --enable-version3 --", - "enable-gpl --enable-version3 --", - "is free software; you can redistribute it and/or modify\n" - "it under the terms of the GNU General Public License as published by\n" - "the Free Software Foundation; either version 3 of the License, or\n" - "(at your option) any later version.\n" - "%s is distributed in the hope that it will be useful,\n" - "but WITHOUT ANY WARRANTY; without even the implied warranty of\n" - "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n" - "GNU General Public License for more details.\n" - "You should have received a copy of the GNU General Public License\n" - "along with %s. If not, see .\n" - "File formats:\n" - "D. = Demuxing supported\n" - ".E = Muxing supported\n" - "%s%s %-15s %s\n" - "Devices:\n" - "Codecs:\n" - "D..... = Decoding supported\n" - ".E.... = Encoding supported\n" - "..V... 
= Video codec\n" - "No option name near '%s'\n" - "Unable to parse '%s': %s\n" - "Setting '%s' to value '%s'\n" - "Option '%s' not found\n" - "--enable-gpl --", - "enable-gpl --enable-version3 --", - "license: GPL version 3 or later", - "enable-gpl --enable-version3 --", - "license: GPL version 3 or later", - "enable-gpl --enable-version3 --", - "license: GPL version 3 or later", - "enable-gpl --enable-version3 --", - "license: GPL version 3 or later", - "enable-gpl --enable-version3 --", - "license: GPL version 3 or later", - "enable-gpl --enable-version3 --", - "license: GPL version 3 or later", - "enable-gpl --enable-version3 --", - "license: GPL version 3 or later", - "This software is derived from the GNU GPL XviD codec (", - ] - - self.check_matched_texts( - test_loc="matched_text/ffmpeg/ffmpeg.exe", - expected_texts=expected_texts, - whole_lines=False, - ) - - def test_matched_text_is_collected_correctly_in_binary_ffmpeg_elf_whole_lines(self): - expected_texts = [ - "--prefix=/usr --extra-version=0ubuntu0.1 --build-suffix=-ffmpeg " - "--toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu " - "--incdir=/usr/include/x86_64-linux-gnu --cc=cc --cxx=g++ --enable-gpl " - "--enable-shared --disable-stripping --disable-decoder=libopenjpeg " - "--disable-decoder=libschroedinger --enable-avresample --enable-avisynth " - "--enable-gnutls --enable-ladspa --enable-libass --enable-libbluray " - "--enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite " - "--enable-libfontconfig --enable-libfreetype --enable-libfribidi " - "--enable-libgme --enable-libgsm --enable-libmodplug --enable-libmp3lame " - "--enable-libopenjpeg --enable-libopus --enable-libpulse --enable-librtmp " - "--enable-libschroedinger --enable-libshine --enable-libsnappy " - "--enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora " - "--enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack " - "--enable-libwebp --enable-libx265 --enable-libxvid 
--enable-libzvbi " - "--enable-openal --enable-opengl --enable-x11grab --enable-libdc1394 " - "--enable-libiec61883 --enable-libzmq --enable-frei0r --enable-libx264 " - "--enable-libopencv", - "%sconfiguration: --prefix=/usr --extra-version=0ubuntu0.1 " - "--build-suffix=-ffmpeg --toolchain=hardened " - "--libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu " - "--cc=cc --cxx=g++ --enable-gpl --enable-shared --disable-stripping " - "--disable-decoder=libopenjpeg --disable-decoder=libschroedinger " - "--enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa " - "--enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca " - "--enable-libcdio --enable-libflite --enable-libfontconfig " - "--enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm " - "--enable-libmodplug --enable-libmp3lame --enable-libopenjpeg " - "--enable-libopus --enable-libpulse --enable-librtmp --enable-libschroedinger " - "--enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex " - "--enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis " - "--enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 " - "--enable-libxvid --enable-libzvbi --enable-openal --enable-opengl " - "--enable-x11grab --enable-libdc1394 --enable-libiec61883 --enable-libzmq " - "--enable-frei0r --enable-libx264 --enable-libopencv", - "%s is free software; you can redistribute it and/or modify\n" - "it under the terms of the GNU General Public License as published by\n" - "the Free Software Foundation; either version 2 of the License, or\n" - "(at your option) any later version.\n" - "%s is distributed in the hope that it will be useful,\n" - "but WITHOUT ANY WARRANTY; without even the implied warranty of\n" - "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the\n" - "GNU General Public License for more details.\n" - "You should have received a copy of the GNU General Public License\n" - "along with %s; if not, write to the Free Software\n" - "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA", - ] - - self.check_matched_texts( - test_loc="matched_text/ffmpeg/ffmpeg", - expected_texts=expected_texts, - whole_lines=True, - ) - - def test_matched_text_is_collected_correctly_in_binary_ffmpeg_static_whole_lines( - self, - ): - expected_texts = ["libswresample license: LGPL version 2.1 or later"] - self.check_matched_texts( - test_loc="matched_text/ffmpeg/libavsample.lib", - expected_texts=expected_texts, - whole_lines=True, - ) diff --git a/scantext/tests/test_views.py b/scantext/tests/test_views.py index 8b4fe938c..a716fa89f 100644 --- a/scantext/tests/test_views.py +++ b/scantext/tests/test_views.py @@ -19,6 +19,10 @@ TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") +SCANCODE_BASE_URL = ( + "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data" +) + class TestScantextViews(FileBasedTesting): test_data_dir = TEST_DATA_DIR @@ -67,6 +71,6 @@ def test_get_rule_text_url__with_default_base_url(self): rule1 = models.Rule(license_expression="apache-2.0 or mit", stored_text="1") rule1.identifier = "apache-2.0_or_mit_48.RULE" result = get_rule_text_url(rule=rule1) - expected = "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/apache-2.0_or_mit_48.RULE" + expected = SCANCODE_BASE_URL + "/rules/apache-2.0_or_mit_48.RULE" assert result == expected diff --git a/scantext/views.py b/scantext/views.py index 265ac9167..7b8ef1cec 100644 --- a/scantext/views.py +++ b/scantext/views.py @@ -208,8 +208,8 @@ def get_licenses( def get_percentage_of_license_text(query, matches): """ - Return percentage of license text matched in ``query`` Query by a list of ``matches`` - percentage is a float between 0 and 100. 
+ Return percentage of license text matched in ``query`` Query by + a list of ``matches`` percentage is a float between 0 and 100. """ # TODO: percentage of license text should be done by scancode-toolkit. @@ -274,7 +274,9 @@ def build_colors(matches_by_id): .matched3 {background-color: rgba(220, 90, 30, 0.3);} """ return [ - f".matched{mid} {{background-color: rgba({(244 * (mid+1)) % 255}, {(234 * (mid+1)) % 255}, {(130 * (mid+1)) % 255}, 0.3);}}" + f""".matched{mid} {{background-color: rgba( + {(244 * (mid+1)) % 255}, {(234 * (mid+1)) % 255}, {(130 * (mid+1)) % 255}, + 0.3);}}""" for mid in matches_by_id ] From 57a1d62075d4d2c43fdfbdc5c22a8f58f55b3fb5 Mon Sep 17 00:00:00 2001 From: Akhil Raj Date: Sun, 18 Sep 2022 16:58:56 +0530 Subject: [PATCH 48/59] Indent html, handle input, fix modules & tests#450 * Indent html code to 2 spaces * Create temp directories for scanning * Move all non-views to `match_text.py` * Correct existing tests to match the rest of the app * Remove debug code Signed-off-by: Akhil Raj --- scantext/match_text.py | 326 ++++++++++++++---- .../includes/license_detail_modal.html | 98 +++--- .../scantext/includes/license_report.html | 4 +- .../includes/license_summary_detail.html | 123 +++---- .../includes/license_summary_header.html | 42 +-- .../templates/scantext/license_scan_form.html | 64 ++-- .../templates/scantext/license_summary.html | 59 ++-- scantext/tests/test_views.py | 47 +-- scantext/views.py | 290 +--------------- 9 files changed, 490 insertions(+), 563 deletions(-) diff --git a/scantext/match_text.py b/scantext/match_text.py index 9e5db2dff..615fe3fb6 100644 --- a/scantext/match_text.py +++ b/scantext/match_text.py @@ -1,62 +1,38 @@ +# SPDX-License-Identifier: Apache-2.0 # -# Copyright (c) nexB Inc. and others. All rights reserved. +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. 
+# Data generated with ScanCode.io is provided as-is without warranties. # ScanCode is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/scancode-toolkit for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. # +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. 
import attr +from licensedcode import models from licensedcode import query from licensedcode.spans import Span from licensedcode.stopwords import STOPWORDS from licensedcode.tokenize import index_tokenizer from licensedcode.tokenize import matched_query_text_tokenizer -TRACE = False -TRACE_MATCHED_TEXT = False -TRACE_MATCHED_TEXT_DETAILS = False - - -def logger_debug(*args): - pass - - -if TRACE or TRACE_MATCHED_TEXT or TRACE_MATCHED_TEXT_DETAILS: - - use_print = True - if use_print: - prn = print - else: - import logging - import sys - - logger = logging.getLogger(__name__) - # logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) - logging.basicConfig(stream=sys.stdout) - logger.setLevel(logging.DEBUG) - prn = logger.debug - - def logger_debug(*args): - return prn(" ".join(isinstance(a, str) and a or repr(a) for a in args)) - - def _debug_print_matched_query_text(match, extras=5): - """ - Print a matched query text including `extras` tokens before and after - the match. Used for debugging license matches. 
- """ - # Create a fake new match with extra tokens before and after - new_match = match.combine(match) - new_qstart = max([0, match.qstart - extras]) - new_qend = min([match.qend + extras, len(match.query.tokens)]) - new_qspan = Span(new_qstart, new_qend) - new_match.qspan = new_qspan - - logger_debug(new_match) - logger_debug(" MATCHED QUERY TEXT with extras") - qt = new_match.matched_text(whole_lines=False) - logger_debug(qt) +SCANCODE_BASE_URL = ( + "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data" +) +SPDX_LICENSE_URL = "https://spdx.org/licenses/{}" +SCANCODE_LICENSEDB_URL = "https://scancode-licensedb.aboutcode.org/{}" @attr.s(slots=True) @@ -103,13 +79,168 @@ class Token: match_ids = attr.ib(attr.Factory(list)) -def tokenize_matched_text( - location, - query_string, - dictionary, - start_line=1, - trace=TRACE_MATCHED_TEXT_DETAILS, -): +def get_match_details(mid, match, license_url_template, spdx_license_url): + """ + Return a mapping of license data built from a LicenseMatch ``match``. + """ + from licensedcode import cache + + licenses = cache.get_licenses_db() + + # TODO: decide whether the text should be highlighted or not. 
+ matched_text = match.matched_text(whole_lines=False, highlight=False) + + SCANCODE_LICENSE_TEXT_URL = SCANCODE_BASE_URL + "/{}.LICENSE" + SCANCODE_LICENSE_DATA_URL = SCANCODE_BASE_URL + "/{}.yml" + + result = {} + + result["mid"] = mid + # Detection Level Information + result["score"] = int(match.score()) + result["start_line"] = match.start_line + result["end_line"] = match.end_line + result["matched_length"] = match.len() + result["match_coverage"] = match.coverage() + result["matcher"] = match.matcher + + # LicenseDB Level Information (Rule that was matched) + result["license_expression"] = match.rule.license_expression + result["rule_text_url"] = get_rule_text_url(match.rule) + result["rule_identifier"] = match.rule.identifier + result["referenced_filenames"] = match.rule.referenced_filenames + result["is_license_text"] = match.rule.is_license_text + result["is_license_notice"] = match.rule.is_license_notice + result["is_license_reference"] = match.rule.is_license_reference + result["is_license_tag"] = match.rule.is_license_tag + result["is_license_intro"] = match.rule.is_license_intro + result["rule_length"] = match.rule.length + result["rule_relevance"] = match.rule.relevance + result["matched_text"] = matched_text + + # License Level Information (Individual licenses that this rule refers to) + result["licenses"] = detected_licenses = [] + for license_key in match.rule.license_keys(): + detected_license = {} + detected_licenses.append(detected_license) + + lic = licenses.get(license_key) + + detected_license["key"] = lic.key + detected_license["name"] = lic.name + detected_license["short_name"] = lic.short_name + detected_license["category"] = lic.category + detected_license["is_exception"] = lic.is_exception + detected_license["is_unknown"] = lic.is_unknown + detected_license["owner"] = lic.owner + detected_license["homepage_url"] = lic.homepage_url + detected_license["text_url"] = lic.text_urls[0] if lic.text_urls else "" + 
detected_license["reference_url"] = license_url_template.format(lic.key) + detected_license["scancode_text_url"] = SCANCODE_LICENSE_TEXT_URL.format( + lic.key + ) + detected_license["scancode_data_url"] = SCANCODE_LICENSE_DATA_URL.format( + lic.key + ) + + spdx_key = lic.spdx_license_key + detected_license["spdx_license_key"] = spdx_key + + if spdx_key: + is_license_ref = spdx_key.lower().startswith("licenseref-") + if is_license_ref: + spdx_url = SCANCODE_LICENSE_TEXT_URL.format(lic.key) + else: + # TODO: Is this replacing spdx_key??? + spdx_key = lic.spdx_license_key.rstrip("+") + spdx_url = spdx_license_url.format(spdx_key) + else: + spdx_url = "" + detected_license["spdx_url"] = spdx_url + + return result + + +def get_licenses(location, license_url_template=SCANCODE_LICENSEDB_URL, **kwargs): + """ + Return a mapping of license match data from detecting license + in the file at ``location`` suitable for use in template. + + The mapping can be empty if there are no matches. + """ + from licensedcode.cache import get_index + + idx = get_index() + + # gets matches from a license file + matches = idx.match( + location=location, + unknown_licenses=True, + **kwargs, + ) + + if not matches: + return {} + + query = matches[0].query + + # Assign a numeric id to every match. 
+ matches_by_id = dict(enumerate(matches)) + + del matches + + license_matches = [] + + for mid, match in matches_by_id.items(): + license_matches.append( + get_match_details( + mid=mid, + match=match, + license_url_template=license_url_template, + spdx_license_url=SPDX_LICENSE_URL, + ) + ) + + license_tokens = get_license_tokens( + query=query, + matches_by_id=matches_by_id, + stopwords=STOPWORDS, + ) + + match_colors = get_build_colors(matches_by_id=matches_by_id) + + return { + "license_matches": license_matches, + "license_tokens": license_tokens, + "match_colors": match_colors, + "license_keys_count": get_license_keys_count(matches=matches_by_id.values()), + "percentage_of_license_text": get_percentage_of_license_text( + query=query, matches=matches_by_id.values() + ), + } + + +def get_license_tokens(query, matches_by_id, stopwords=STOPWORDS): + """ + Return a list of tokens from the list of ``matches`` in ``query``. + """ + # Token(value="", pos=3, is_text=True, is_matched=True, match_ids=[mid, mid, mid]) + tokens = list( + tokenize_matched_text( + location=query.location, + query_string=query.query_string, + dictionary=query.idx.dictionary, + start_line=query.start_line, + ) + ) + + for mid, match in matches_by_id.items(): + tag_matched_tokens(tokens=tokens, match_qspan=match.qspan, mid=mid) + + return tokens + + +def tokenize_matched_text(location, query_string, dictionary, start_line=1): """ Yield Token objects with pos and line number collected from the file at `location` or the `query_string` string. `dictionary` is the index mapping @@ -123,15 +254,7 @@ def tokenize_matched_text( start_line=start_line, ) for line_num, line in qls: - if trace: - logger_debug( - " tokenize_matched_text:", "line_num:", line_num, "line:", line - ) - for is_text, token_str in matched_query_text_tokenizer(line): - if trace: - logger_debug(" is_text:", is_text, "token_str:", repr(token_str)) - # Determine if a token is is_known in the license index or not. 
This # is essential as we need to realign the query-time tokenization # with the full text to report proper matches. @@ -196,3 +319,80 @@ def tokenize_matched_text( is_known=False, pos=-1, ) + + +def tag_matched_tokens(tokens, match_qspan, mid): + """ + Tag an iterable of ``tokens`` tagging each token with ``mid`` match id + if matched meaning the token is in the ``match_qspan``. + """ + previous_is_matched = False + for tok in tokens: + if previous_is_matched and not tok.is_text: + tok.match_ids.append(mid) + tok = attr.evolve(tok, is_matched=True) + previous_is_matched = False + elif tok.pos != -1 and tok.is_known and tok.pos in match_qspan: + tok.match_ids.append(mid) + tok = attr.evolve(tok, is_matched=True) + previous_is_matched = True + + +def get_build_colors(matches_by_id): + """ + Return a mapping of mid to css color code. + + .matched1 {background-color: rgba(30, 220, 90, 0.3);} + .matched2 {background-color: rgba(30, 90, 220, 0.3);} + .matched3 {background-color: rgba(220, 90, 30, 0.3);} + """ + return [ + f""".matched{mid} {{background-color: rgba( + {(244 * (mid+1)) % 255}, {(234 * (mid+1)) % 255}, {(130 * (mid+1)) % 255}, + 0.3);}}""" + for mid in matches_by_id + ] + + +def get_percentage_of_license_text(query, matches): + """ + Return percentage of license text matched in ``query`` Query by + a list of ``matches`` percentage is a float between 0 and 100. + """ + + # TODO: percentage of license text should be done by scancode-toolkit. + if not matches: + return 0 + + qspans = (match.qspan for match in matches) + + matched_tokens_length = len(Span().union(*qspans)) + query_tokens_length = query.tokens_length(with_unknown=True) + return round((matched_tokens_length / query_tokens_length) * 100, 2) + + +def get_rule_text_url(rule, base_url=SCANCODE_BASE_URL): + """ + Return a URL to the text file of a ``rule`` Rule. + Return None if there is no URL for the ``rule``. 
+ """ + + if isinstance(rule, (models.SpdxRule, models.UnknownRule)): + return + + if rule.is_from_license: + return f"{base_url}/licenses/{rule.identifier}" + + else: + return f"{base_url}/rules/{rule.identifier}" + + +def get_license_keys_count(matches): + """ + Return the number of unique license keys found in a list of license matches. + """ + keys = set() + for match in matches: + keys.update(match.rule.license_keys()) + + return len(keys) diff --git a/scantext/templates/scantext/includes/license_detail_modal.html b/scantext/templates/scantext/includes/license_detail_modal.html index 5a269fcd4..b85171ce5 100644 --- a/scantext/templates/scantext/includes/license_detail_modal.html +++ b/scantext/templates/scantext/includes/license_detail_modal.html @@ -7,55 +7,55 @@