diff --git a/AUTHORS.rst b/AUTHORS.rst index f7f748a8c92..3bd18d111d8 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -34,6 +34,7 @@ The following organizations or individuals have contributed to ScanCode: - Jelmer Vernooij @jelmer - Jillian Daguil @jdaguil - Jiri Popelka @jpopelka +- Jiyeong Seok @dd-jy - Johannes Najjar @joshovi - Jose Nazario @paralax - Lemo Shi @lemoshi @@ -43,6 +44,7 @@ The following organizations or individuals have contributed to ScanCode: - Maximilian Huber @maxhbr - Michael Herzog @mjherzog - Michael Rupprecht @michaelrup +- Mike Rombout @mrombout - Mrinal Paliwal @mnpw - nexB Inc. @nexB - Nirmal Sarswat @vivonk diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 497cb2baede..27fe05902d9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -49,6 +49,9 @@ License detection: - XXXX new license detection rules have been added, and - XXXX existing license rules have been updated. +- Key phrases can now be defined in RULEs by surrounding one or more words with + ``{{`` and ``}}``. When key phrases are defined, a RULE only matches if each key + phrase is matched exactly. Package detection: ~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/how-to-guides/add_new_license_detection_rule.rst b/docs/source/how-to-guides/add_new_license_detection_rule.rst index 239a5bc9c70..ddc06ca9a0b 100644 --- a/docs/source/how-to-guides/add_new_license_detection_rule.rst +++ b/docs/source/how-to-guides/add_new_license_detection_rule.rst @@ -62,3 +62,7 @@ More (advanced) rules options: - Each rules needs have one flag such as is_license_notice. See the ``src/licensedcode/models.py`` directory for a list of all possible values and other options. + +- You can specify key phrases by surrounding one or more words with the ``{{`` + and ``}}`` tags. Key phrases are words that **must** be matched/present in order + for a RULE to be considered a match.
diff --git a/src/licensedcode/data/rules/apache-2.0_1026.RULE b/src/licensedcode/data/rules/apache-2.0_1026.RULE new file mode 100644 index 00000000000..58bca330e1c --- /dev/null +++ b/src/licensedcode/data/rules/apache-2.0_1026.RULE @@ -0,0 +1 @@ +This SDK is distributed under the Apache License, Version 2.0, see LICENSE for more information. \ No newline at end of file diff --git a/src/licensedcode/data/rules/apache-2.0_1026.yml b/src/licensedcode/data/rules/apache-2.0_1026.yml new file mode 100644 index 00000000000..fec9233bc17 --- /dev/null +++ b/src/licensedcode/data/rules/apache-2.0_1026.yml @@ -0,0 +1,3 @@ +license_expression: apache-2.0 +is_license_notice: yes +relevance: 100 diff --git a/src/licensedcode/data/rules/apache-2.0_571.RULE b/src/licensedcode/data/rules/apache-2.0_571.RULE index fa3c6792f47..71781a4845d 100644 --- a/src/licensedcode/data/rules/apache-2.0_571.RULE +++ b/src/licensedcode/data/rules/apache-2.0_571.RULE @@ -1,6 +1,6 @@ - Apache License 2.0 + {{Apache License 2.0}} http://repository.jboss.org/licenses/apache-2.0.txt repo diff --git a/src/licensedcode/data/rules/apache-2.0_and_cc-by-sa-4.0_2.RULE b/src/licensedcode/data/rules/apache-2.0_and_cc-by-sa-4.0_2.RULE index 27d5664d01a..414362cedc8 100644 --- a/src/licensedcode/data/rules/apache-2.0_and_cc-by-sa-4.0_2.RULE +++ b/src/licensedcode/data/rules/apache-2.0_and_cc-by-sa-4.0_2.RULE @@ -1 +1 @@ -Code is released under the [Apache 2.0 license](LICENSE). This `README.md` file and the [`CONTRIBUTING.md`](CONTRIBUTING.md) file are licensed under the Creative Commons Attribution 4.0 International License under the terms and conditions set forth in the file [`LICENSE.docs`](LICENSE.docs). You may obtain a duplicate copy of the same license, titled CC BY-SA 4.0, at http://creativecommons.org/licenses/by-sa/4.0/. \ No newline at end of file +Code is released under the [{{Apache 2.0}} license](LICENSE). 
This `README.md` file and the [`CONTRIBUTING.md`](CONTRIBUTING.md) file are licensed under the {{Creative Commons Attribution 4.0 International License}} under the terms and conditions set forth in the file [`LICENSE.docs`](LICENSE.docs). You may obtain a duplicate copy of the same license, titled CC BY-SA 4.0, at http://creativecommons.org/licenses/by-sa/4.0/. \ No newline at end of file diff --git a/src/licensedcode/data/rules/apache-2.0_or_epl-2.0_1.RULE b/src/licensedcode/data/rules/apache-2.0_or_epl-2.0_1.RULE index 8a2ac29e7d4..643fbc95194 100644 --- a/src/licensedcode/data/rules/apache-2.0_or_epl-2.0_1.RULE +++ b/src/licensedcode/data/rules/apache-2.0_or_epl-2.0_1.RULE @@ -1,11 +1,11 @@ - The Apache Software License, Version 2.0 + {{The Apache Software License}}, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0.txt repo - Eclipse Public License - v 2.0 + {{Eclipse Public License}} - v 2.0 http://www.eclipse.org/legal/epl-v20.html repo diff --git a/src/licensedcode/data/rules/apache-2.0_or_epl-2.0_2.RULE b/src/licensedcode/data/rules/apache-2.0_or_epl-2.0_2.RULE index 37ba4a52ee5..43fb89b4d30 100644 --- a/src/licensedcode/data/rules/apache-2.0_or_epl-2.0_2.RULE +++ b/src/licensedcode/data/rules/apache-2.0_or_epl-2.0_2.RULE @@ -1,11 +1,11 @@ - The Apache Software License, Version 2.0 + The {{Apache Software License, Version 2.0}} https://www.apache.org/licenses/LICENSE-2.0.txt repo - Eclipse Public License - v 2.0 + {{Eclipse Public License - v 2.0}} http://www.eclipse.org/legal/epl-v20.html repo diff --git a/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_3.RULE b/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_3.RULE index fc0cf910602..cf5115a56b7 100644 --- a/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_3.RULE +++ b/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_3.RULE @@ -1 +1 @@ -The Apache Software License, Version 2.0http://www.apache.org/licenses/LICENSE-2.0.txtrepoGNU Lesser General Public License (LGPL), Version 
2.1http://www.fsf.org/licensing/licenses/lgpl.txtrepo \ No newline at end of file +The {{Apache Software License}}, Version 2.0http://www.apache.org/licenses/LICENSE-2.0.txtrepo{{GNU Lesser General Public License}} (LGPL), Version 2.1http://www.fsf.org/licensing/licenses/lgpl.txtrepo \ No newline at end of file diff --git a/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_4.RULE b/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_4.RULE index 4a90a2345ad..d803aab426f 100644 --- a/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_4.RULE +++ b/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_4.RULE @@ -1 +1 @@ -The Apache Software License, Version 2.0https://www.apache.org/licenses/LICENSE-2.0.txtrepoGNU Lesser General Public License (LGPL), Version 2.1http://www.fsf.org/licensing/licenses/lgpl.txtrepo \ No newline at end of file +The {{Apache Software License}}, Version 2.0https://www.apache.org/licenses/LICENSE-2.0.txtrepo{{GNU Lesser General Public License}} (LGPL), Version 2.1http://www.fsf.org/licensing/licenses/lgpl.txtrepo \ No newline at end of file diff --git a/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_5.RULE b/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_5.RULE index a439ac54c74..db6fb1b5876 100644 --- a/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_5.RULE +++ b/src/licensedcode/data/rules/apache-2.0_or_gpl-1.0-plus_5.RULE @@ -1,2 +1,2 @@ -The Apache Software License, Version 2.0https://www.apache.org/licenses/LICENSE-2.0.txtrepoGNU Lesser General Public License (LGPL), Version 2.1http://www.fsf.org/licensing/licenses/lgpl.txtrepo \ No newline at end of file +The {{Apache Software License}}, Version 2.0https://www.apache.org/licenses/LICENSE-2.0.txtrepo{{GNU Lesser General Public License}} (LGPL), Version 2.1http://www.fsf.org/licensing/licenses/lgpl.txtrepo \ No newline at end of file diff --git a/src/licensedcode/data/rules/apache-2.0_or_gpl-2.0-plus_with_classpath-exception-2.0_10.RULE 
b/src/licensedcode/data/rules/apache-2.0_or_gpl-2.0-plus_with_classpath-exception-2.0_10.RULE index 365734e3f45..0620c6d9dc7 100644 --- a/src/licensedcode/data/rules/apache-2.0_or_gpl-2.0-plus_with_classpath-exception-2.0_10.RULE +++ b/src/licensedcode/data/rules/apache-2.0_or_gpl-2.0-plus_with_classpath-exception-2.0_10.RULE @@ -1,16 +1,16 @@ - Apache License, Version 2.0 + {{Apache License, Version 2.0}} http://www.apache.org/licenses/LICENSE-2.0 repo - GNU General Public License (GPL) version 2, or any later version + {{GNU General Public License (GPL) version 2}}, or any later version https://www.gnu.org/licenses/ repo - GPLv2 with Classpath exception + {{GPLv2 with Classpath exception}} https://www.gnu.org/software/classpath/license.html repo diff --git a/src/licensedcode/data/rules/apache-2.0_or_gpl-2.0-plus_with_classpath-exception-2.0_7.RULE b/src/licensedcode/data/rules/apache-2.0_or_gpl-2.0-plus_with_classpath-exception-2.0_7.RULE index 521e6593476..6101351b0a4 100644 --- a/src/licensedcode/data/rules/apache-2.0_or_gpl-2.0-plus_with_classpath-exception-2.0_7.RULE +++ b/src/licensedcode/data/rules/apache-2.0_or_gpl-2.0-plus_with_classpath-exception-2.0_7.RULE @@ -1,16 +1,16 @@ - Apache License, Version 2.0 + {{Apache License}}, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0 repo - GNU General Public License (GPL) version 2, or any later version + {{GNU General Public License}} (GPL) version 2, or any later version https://www.gnu.org/licenses/ repo - GPLv2 with Classpath exception + {{GPLv2}} with {{Classpath exception}} https://www.gnu.org/software/classpath/license.html repo diff --git a/src/licensedcode/data/rules/apache-2.0_url_12.RULE b/src/licensedcode/data/rules/apache-2.0_url_12.RULE index ae60af9ef7f..be529762a34 100644 --- a/src/licensedcode/data/rules/apache-2.0_url_12.RULE +++ b/src/licensedcode/data/rules/apache-2.0_url_12.RULE @@ -1 +1 @@ - Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0 repo \ No newline at 
end of file + {{Apache License}}, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0 repo \ No newline at end of file diff --git a/src/licensedcode/data/rules/apache-2.0_url_13.RULE b/src/licensedcode/data/rules/apache-2.0_url_13.RULE index 27fe26f805f..81b27c9d51c 100644 --- a/src/licensedcode/data/rules/apache-2.0_url_13.RULE +++ b/src/licensedcode/data/rules/apache-2.0_url_13.RULE @@ -1 +1 @@ - Apache License, Version 2.0 https://apache.org/licenses/LICENSE-2.0 repo \ No newline at end of file + {{Apache License}}, Version 2.0 https://apache.org/licenses/LICENSE-2.0 repo \ No newline at end of file diff --git a/src/licensedcode/data/rules/apache-2.0_url_14.RULE b/src/licensedcode/data/rules/apache-2.0_url_14.RULE index afd6f2ab845..39a6bfa866b 100644 --- a/src/licensedcode/data/rules/apache-2.0_url_14.RULE +++ b/src/licensedcode/data/rules/apache-2.0_url_14.RULE @@ -1 +1 @@ - Apache License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0 repo \ No newline at end of file + {{Apache License, Version 2.0}} http://www.apache.org/licenses/LICENSE-2.0 repo \ No newline at end of file diff --git a/src/licensedcode/data/rules/apache-2.0_url_15.RULE b/src/licensedcode/data/rules/apache-2.0_url_15.RULE index c979cb75167..fb6b60a5b2c 100644 --- a/src/licensedcode/data/rules/apache-2.0_url_15.RULE +++ b/src/licensedcode/data/rules/apache-2.0_url_15.RULE @@ -1 +1 @@ - Apache License, Version 2.0 http://apache.org/licenses/LICENSE-2.0 repo \ No newline at end of file + {{Apache License}}, Version 2.0 http://apache.org/licenses/LICENSE-2.0 repo \ No newline at end of file diff --git a/src/licensedcode/data/rules/bsd-new_531.RULE b/src/licensedcode/data/rules/bsd-new_531.RULE index f63a6c3259f..18f41e4e378 100644 --- a/src/licensedcode/data/rules/bsd-new_531.RULE +++ b/src/licensedcode/data/rules/bsd-new_531.RULE @@ -1,6 +1,6 @@ - Berkeley Software Distribution (BSD) License + {{Berkeley Software Distribution}} (BSD) License 
http://www.opensource.org/licenses/bsd-license.html repo diff --git a/src/licensedcode/data/rules/bsd-new_58.RULE b/src/licensedcode/data/rules/bsd-new_58.RULE index 305d08d1ea0..f841492b404 100644 --- a/src/licensedcode/data/rules/bsd-new_58.RULE +++ b/src/licensedcode/data/rules/bsd-new_58.RULE @@ -1,6 +1,6 @@ - Berkeley Software Distribution (BSD) License + {{Berkeley Software Distribution (BSD) License}} http://www.opensource.org/licenses/bsd-license.html diff --git a/src/licensedcode/data/rules/bsd-simplified_and_gpl-2.0_1.RULE b/src/licensedcode/data/rules/bsd-simplified_and_gpl-2.0_1.RULE index e5b1195518e..1bc4272e8a7 100644 --- a/src/licensedcode/data/rules/bsd-simplified_and_gpl-2.0_1.RULE +++ b/src/licensedcode/data/rules/bsd-simplified_and_gpl-2.0_1.RULE @@ -1,6 +1,6 @@ This repository uses 2 different licenses : -- all files in the `lib` directory use a BSD 2-Clause license -- all other files use a GPLv2 license, unless explicitly stated otherwise +- all files in the `lib` directory use a {{BSD 2-Clause}} license +- all other files use a {{GPLv2}} license, unless explicitly stated otherwise Relevant license is reminded at the top of each source file, and with presence of COPYING or LICENSE file in associated directories. \ No newline at end of file diff --git a/src/licensedcode/data/rules/cc-by-4.0_16.RULE b/src/licensedcode/data/rules/cc-by-4.0_16.RULE index 1a8d281d33a..3781b03661f 100644 --- a/src/licensedcode/data/rules/cc-by-4.0_16.RULE +++ b/src/licensedcode/data/rules/cc-by-4.0_16.RULE @@ -1,4 +1,4 @@ -under the Creative Commons Attribution 4.0 International License (the "License"); +under the {{Creative Commons Attribution 4.0 International License}} (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at diff --git a/src/licensedcode/data/rules/cc-by-nc-sa-4.0_24.RULE b/src/licensedcode/data/rules/cc-by-nc-sa-4.0_24.RULE new file mode 100644 index 00000000000..cf4571da813 --- /dev/null +++ b/src/licensedcode/data/rules/cc-by-nc-sa-4.0_24.RULE @@ -0,0 +1,7 @@ +This work is licensed under a {{Creative Commons Attribution-NonCommercial-ShareAlike 4.0}} +International License (the "License"). You may not use this file except in compliance with the +License. A copy of the License is located at http://creativecommons.org/licenses/by-nc-sa/4.0/. + +This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +either express or implied. See the License for the specific language governing permissions and +limitations under the License. diff --git a/src/licensedcode/data/rules/cc-by-nc-sa-4.0_24.yml b/src/licensedcode/data/rules/cc-by-nc-sa-4.0_24.yml new file mode 100644 index 00000000000..431a78a757f --- /dev/null +++ b/src/licensedcode/data/rules/cc-by-nc-sa-4.0_24.yml @@ -0,0 +1,5 @@ +license_expression: cc-by-nc-sa-4.0 +is_license_notice: yes +relevance: 100 +ignorable_urls: + - http://creativecommons.org/licenses/by-nc-sa/4.0 diff --git a/src/licensedcode/data/rules/cc-by-nc-sa-4.0_25.RULE b/src/licensedcode/data/rules/cc-by-nc-sa-4.0_25.RULE new file mode 100644 index 00000000000..655fc9d5b4b --- /dev/null +++ b/src/licensedcode/data/rules/cc-by-nc-sa-4.0_25.RULE @@ -0,0 +1 @@ +{{Creative Commons Attribution-NonCommercial-ShareAlike 4.0}} International License diff --git a/src/licensedcode/data/rules/cc-by-nc-sa-4.0_25.yml b/src/licensedcode/data/rules/cc-by-nc-sa-4.0_25.yml new file mode 100644 index 00000000000..52fcc6b0e49 --- /dev/null +++ b/src/licensedcode/data/rules/cc-by-nc-sa-4.0_25.yml @@ -0,0 +1,3 @@ +license_expression: cc-by-nc-sa-4.0 +is_license_reference: yes +relevance: 100 diff --git a/src/licensedcode/data/rules/cddl-1.0_27.RULE b/src/licensedcode/data/rules/cddl-1.0_27.RULE index 
23b56dd751e..706ebfa2038 100644 --- a/src/licensedcode/data/rules/cddl-1.0_27.RULE +++ b/src/licensedcode/data/rules/cddl-1.0_27.RULE @@ -1,6 +1,6 @@ -CDDL 1 +{{CDDL 1}} http://opensource.org/licenses/CDDL-1.0 repo diff --git a/src/licensedcode/data/rules/epl-1.0_or_lgpl-2.1-plus_4.RULE b/src/licensedcode/data/rules/epl-1.0_or_lgpl-2.1-plus_4.RULE index 86ad41f9581..62f743c1751 100644 --- a/src/licensedcode/data/rules/epl-1.0_or_lgpl-2.1-plus_4.RULE +++ b/src/licensedcode/data/rules/epl-1.0_or_lgpl-2.1-plus_4.RULE @@ -1,11 +1,11 @@ - Eclipse Public License 1.0 + {{Eclipse Public License 1.0}} http://www.eclipse.org/org/documents/epl-v10.php Manual - GNU Lesser GPL 2.1 + {{GNU Lesser GPL 2.1}} http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html manual diff --git a/src/licensedcode/data/rules/epl-1.0_or_lgpl-2.1-plus_5.RULE b/src/licensedcode/data/rules/epl-1.0_or_lgpl-2.1-plus_5.RULE index 7627028a972..631815f6629 100644 --- a/src/licensedcode/data/rules/epl-1.0_or_lgpl-2.1-plus_5.RULE +++ b/src/licensedcode/data/rules/epl-1.0_or_lgpl-2.1-plus_5.RULE @@ -1,11 +1,11 @@ - Eclipse Public License 1.0 + {{Eclipse Public License 1.0}} http://www.eclipse.org/org/documents/epl-v10.php Manual - GNU Lesser GPL 2.1 + {{GNU Lesser GPL 2.1}} https://www.gnu.org/licenses/old-licenses/lgpl-2.1.html manual diff --git a/src/licensedcode/data/rules/false-positive_ansible_2.RULE b/src/licensedcode/data/rules/false-positive_ansible_2.RULE index 2b507d40c27..cb4c996eb75 100644 --- a/src/licensedcode/data/rules/false-positive_ansible_2.RULE +++ b/src/licensedcode/data/rules/false-positive_ansible_2.RULE @@ -5,4 +5,4 @@ # - GPLv3 # - Apache # - CC-BY -license: {{ license }} +license: license diff --git a/src/licensedcode/data/rules/gpl-2.0-plus_and_bootloader-exception_1.RULE b/src/licensedcode/data/rules/gpl-2.0-plus_and_bootloader-exception_1.RULE index 1ebc8646806..02fe842c6bf 100644 --- a/src/licensedcode/data/rules/gpl-2.0-plus_and_bootloader-exception_1.RULE +++ 
b/src/licensedcode/data/rules/gpl-2.0-plus_and_bootloader-exception_1.RULE @@ -1,4 +1,4 @@ -Distributed under the terms of the GNU General Public License (version 2 -# or later) with exception for distributing the bootloader. +Distributed under the terms of the {{GNU General Public License}} (version 2 +# or later) with exception for distributing the {{bootloader}}. # -# The full license is in the file COPYING.txt, distributed with this software. \ No newline at end of file +# The full license is in the file COPYING.txt, distributed with this software. diff --git a/src/licensedcode/data/rules/gpl-2.0-plus_and_bootloader-exception_1.yml b/src/licensedcode/data/rules/gpl-2.0-plus_and_bootloader-exception_1.yml index 9a7bcee6b5d..d05d7350205 100644 --- a/src/licensedcode/data/rules/gpl-2.0-plus_and_bootloader-exception_1.yml +++ b/src/licensedcode/data/rules/gpl-2.0-plus_and_bootloader-exception_1.yml @@ -3,3 +3,4 @@ is_license_notice: yes relevance: 100 referenced_filenames: - COPYING.txt +minimum_coverage: 70 diff --git a/src/licensedcode/data/rules/gpl-2.0_1044.RULE b/src/licensedcode/data/rules/gpl-2.0_1044.RULE index 8f9baadf555..c8a3f839fea 100644 --- a/src/licensedcode/data/rules/gpl-2.0_1044.RULE +++ b/src/licensedcode/data/rules/gpl-2.0_1044.RULE @@ -1,3 +1,3 @@ -Licensed under the GNU General Public License v2 (the "License"); +Licensed under the {{GNU General Public License v2}} (the "License"); you may not use this file except in compliance with the License. The terms of the License are located in the COPYING file of this distribution. \ No newline at end of file diff --git a/src/licensedcode/data/rules/gpl-2.0_644.RULE b/src/licensedcode/data/rules/gpl-2.0_644.RULE index 17087a59fba..512d99d548b 100644 --- a/src/licensedcode/data/rules/gpl-2.0_644.RULE +++ b/src/licensedcode/data/rules/gpl-2.0_644.RULE @@ -1,4 +1,4 @@ -#{{IS_RIGHT +#IS_RIGHT # This program is distributed under GPL Version 2.0 in the hope that # it will be useful, but WITHOUT ANY WARRANTY. 
-#}}IS_RIGHT +#IS_RIGHT diff --git a/src/licensedcode/data/rules/gpl-3.0_111.RULE b/src/licensedcode/data/rules/gpl-3.0_111.RULE index bb8003759fd..fdabb53d9f9 100644 --- a/src/licensedcode/data/rules/gpl-3.0_111.RULE +++ b/src/licensedcode/data/rules/gpl-3.0_111.RULE @@ -1,5 +1,5 @@ -{{IS_RIGHT +IS_RIGHT This program is distributed under GPL Version 3.0 in the hope that it will be useful, but WITHOUT ANY WARRANTY. -}}IS_RIGHT \ No newline at end of file +IS_RIGHT \ No newline at end of file diff --git a/src/licensedcode/data/rules/imagemagick_19.RULE b/src/licensedcode/data/rules/imagemagick_19.RULE index bcef682e3bc..3138f2a565c 100644 --- a/src/licensedcode/data/rules/imagemagick_19.RULE +++ b/src/licensedcode/data/rules/imagemagick_19.RULE @@ -45,7 +45,7 @@ You may add Your own copyright statement to Your modifications and may provide a How to Apply the License to your Work -To apply the ImageMagick License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information (don't include the brackets). The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. +To apply the {{ImageMagick}} License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information (don't include the brackets). The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] diff --git a/src/licensedcode/data/rules/lgpl-3.0-plus_216.RULE b/src/licensedcode/data/rules/lgpl-3.0-plus_216.RULE index d672f32ad1d..ebd9bd5ee67 100644 --- a/src/licensedcode/data/rules/lgpl-3.0-plus_216.RULE +++ b/src/licensedcode/data/rules/lgpl-3.0-plus_216.RULE @@ -1,4 +1,4 @@ -GNU Lesser General Public License, Version 3 {{{ */ +GNU Lesser General Public License, Version 3 */ is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the diff --git a/src/licensedcode/data/rules/lgpl-3.0-plus_27.RULE b/src/licensedcode/data/rules/lgpl-3.0-plus_27.RULE index 9703cd3302d..3a14c5f7770 100644 --- a/src/licensedcode/data/rules/lgpl-3.0-plus_27.RULE +++ b/src/licensedcode/data/rules/lgpl-3.0-plus_27.RULE @@ -1,4 +1,4 @@ -GNU Lesser General Public License, Version 3 {{{ */ +GNU Lesser General Public License, Version 3 */ is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the diff --git a/src/licensedcode/data/rules/mit_923.RULE b/src/licensedcode/data/rules/mit_923.RULE index 89138d8be90..656c2f061e0 100644 --- a/src/licensedcode/data/rules/mit_923.RULE +++ b/src/licensedcode/data/rules/mit_923.RULE @@ -1,2 +1,2 @@ License -Distributed under the MIT License. See LICENSE for more information. \ No newline at end of file +Distributed under the {{MIT}} License. See LICENSE for more information. \ No newline at end of file diff --git a/src/licensedcode/data/rules/mpl-1.0_14.RULE b/src/licensedcode/data/rules/mpl-1.0_14.RULE index ef4bfb501a0..c62b6c35836 100644 --- a/src/licensedcode/data/rules/mpl-1.0_14.RULE +++ b/src/licensedcode/data/rules/mpl-1.0_14.RULE @@ -1,6 +1,6 @@ MPL: -"The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.mozilla.org/MPL/ +"The contents of this file are subject to the {{Mozilla Public License}} Version 1.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.mozilla.org/MPL/ Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific language governing rights and limitations under the License. diff --git a/src/licensedcode/data/rules/mpl-1.0_8.RULE b/src/licensedcode/data/rules/mpl-1.0_8.RULE index 070c8930174..4005eb481ff 100644 --- a/src/licensedcode/data/rules/mpl-1.0_8.RULE +++ b/src/licensedcode/data/rules/mpl-1.0_8.RULE @@ -1,4 +1,4 @@ - * The contents of this file are subject to the Mozilla Public License + * The contents of this file are subject to the {{Mozilla Public License}} * Version 1.0 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ diff --git a/src/licensedcode/data/rules/mpl-1.1_36.RULE b/src/licensedcode/data/rules/mpl-1.1_36.RULE index 34bdce9d12d..f8952211915 100644 --- a/src/licensedcode/data/rules/mpl-1.1_36.RULE +++ b/src/licensedcode/data/rules/mpl-1.1_36.RULE @@ -1,6 +1,6 @@ The MPL v1.1: - The contents of this file are subject to the Mozilla Public License + The contents of this file are subject to the {{Mozilla Public License}} Version 1.1 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www..com/mpl.html diff --git a/src/licensedcode/data/rules/mpl-1.1_40.RULE b/src/licensedcode/data/rules/mpl-1.1_40.RULE index a87e08fd30b..13ee36dce8d 100644 --- a/src/licensedcode/data/rules/mpl-1.1_40.RULE +++ b/src/licensedcode/data/rules/mpl-1.1_40.RULE @@ -1,4 +1,4 @@ -The contents of this file are subject to the Mozilla Public License Version 1.1 +The contents of this file are subject to the {{Mozilla Public License}} Version 1.1 (the License); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.mozilla.org/MPL/ Software distributed under the License is distributed on an AS IS basis, diff --git a/src/licensedcode/data/rules/mpl-1.1_6.RULE b/src/licensedcode/data/rules/mpl-1.1_6.RULE index fd7a96718e5..69266fa7a44 100644 --- a/src/licensedcode/data/rules/mpl-1.1_6.RULE +++ b/src/licensedcode/data/rules/mpl-1.1_6.RULE @@ -1,4 +1,4 @@ -%% The contents of this file are subject to the Mozilla Public License +%% The contents of this file are subject to the {{Mozilla Public License}} %% Version 1.1 (the "License"); you may not use this file except in %% compliance with the License. You may obtain a copy of the License at %% http://www.mozilla.org/MPL/ diff --git a/src/licensedcode/data/rules/npl-1.1_1.RULE b/src/licensedcode/data/rules/npl-1.1_1.RULE index d5bd2cdb076..5237c226ad8 100644 --- a/src/licensedcode/data/rules/npl-1.1_1.RULE +++ b/src/licensedcode/data/rules/npl-1.1_1.RULE @@ -1,5 +1,5 @@ -* The contents of this file are subject to the Netscape Public - * License Version 1.1 (the "License"); you may not use this file +* The contents of this file are subject to the {{Netscape Public + * License}} Version 1.1 (the "License"); you may not use this file * except in compliance with the License. 
You may obtain a copy of * the License at http://www.mozilla.org/NPL/ * diff --git a/src/licensedcode/data/rules/proprietary_133.RULE b/src/licensedcode/data/rules/proprietary_133.RULE index 63257a39569..f7e787947b7 100644 --- a/src/licensedcode/data/rules/proprietary_133.RULE +++ b/src/licensedcode/data/rules/proprietary_133.RULE @@ -1,4 +1,4 @@ - * The contents of this file are subject to the KnowledgeTree Public + * The contents of this file are subject to the {{KnowledgeTree}} Public * License Version 1.1 ("License"); You may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.ktdms.com/KPL diff --git a/src/licensedcode/data/rules/ruby_5.RULE b/src/licensedcode/data/rules/ruby_5.RULE index 42964004be8..1258effabfc 100644 --- a/src/licensedcode/data/rules/ruby_5.RULE +++ b/src/licensedcode/data/rules/ruby_5.RULE @@ -1 +1 @@ - This library is distributed under the terms of the Ruby license. \ No newline at end of file + This library is distributed under the terms of the {{Ruby}} license. \ No newline at end of file diff --git a/src/licensedcode/index.py b/src/licensedcode/index.py index 28217e10885..61dfcaff859 100644 --- a/src/licensedcode/index.py +++ b/src/licensedcode/index.py @@ -330,6 +330,9 @@ def _add_rules(self, rules, _legalese=common_license_words, _spdx_tokens=frozens # "weak" rules can only be matched with an automaton. 
is_weak = True + # note down any key phrase spans that must be present for the rule to pass through refinement + rule.key_phrase_spans = list(rule.key_phrases()) + for rts in rule.tokens(): rtid = dictionary_get(rts) if rtid is None: diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index 7195289d672..3104ff3a6e3 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -35,6 +35,7 @@ TRACE_FILTER_LOW_SCORE = False TRACE_FILTER_UNKNOWN_WORDS = False TRACE_SET_LINES = False +TRACE_KEY_PHRASES = False TRACE_MATCHED_TEXT = False TRACE_MATCHED_TEXT_DETAILS = False @@ -385,7 +386,7 @@ def qcontains_stopwords(self): stopwords_pos = qspan & query.stopwords_span stopwords_pos = (pos for pos in stopwords_pos if pos != qspe) qry_stopxpos = query.stopwords_by_pos - return any(qry_stopxpos[pos] for pos in stopwords_pos) + return any(qry_stopxpos.get(pos, 0) for pos in stopwords_pos) def qrange(self): """ @@ -1216,8 +1217,12 @@ def filter_low_score(matches, min_score=100): return kept, discarded -def filter_spurious_single_token(matches, query=None, unknown_count=5, - trace=TRACE_FILTER_SPURIOUS_SINGLE_TOKEN): +def filter_spurious_single_token( + matches, + query=None, + unknown_count=5, + trace=TRACE_FILTER_SPURIOUS_SINGLE_TOKEN, +): """ Return a filtered list of kept LicenseMatch matches and a list of discardable matches given a `matches` list of LicenseMatch by removing @@ -1249,9 +1254,10 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5, qend = match.qend # compute the number of unknown tokens before and after this single - # matched position note: unknowns_by_pos is a defaultdict(int), - # shorts_and_digits is a set of integers - before = unknowns_by_pos[qstart - 1] + # matched position note: + # - unknowns_by_pos is a dict, + # - shorts_and_digits is a set of ints + before = unknowns_by_pos.get(qstart - 1, 0) for p in range(qstart - 1 - unknown_count, qstart): if p in shorts_and_digits: before += 1 @@ -1262,7 
def _is_key_phrase_matched(match, key_phrase_span):
    """
    Return True if the rule-side `key_phrase_span` Span is entirely matched by
    the `match` LicenseMatch, as one uninterrupted run of query tokens.
    """
    qspan = match.qspan
    ispan = match.ispan

    # The key phrase must be entirely matched on the rule side.
    if key_phrase_span not in ispan:
        return False

    # qspan and ispan are walked in lockstep: the first query position aligned
    # with a key phrase position is where the key phrase starts in the query.
    # The containment check above ensures there is at least one such position.
    qpos_start = next(
        qpos for qpos, ipos in zip(qspan, ispan) if ipos in key_phrase_span
    )

    # Span(start, end) includes its end position, hence the "- 1": the query
    # side of the key phrase covers exactly len(key_phrase_span) positions.
    # Without it, one extra matched token would be required after the key
    # phrase and a key phrase ending a match could never be accepted.
    query_key_phrase_span = Span(
        qpos_start,
        qpos_start + len(key_phrase_span) - 1,
    )
    if query_key_phrase_span not in qspan:
        return False

    # The key phrase must not be interrupted by unknown words or stopwords.
    #
    # Do not check the last position of a key phrase: unknowns and stopwords
    # are recorded as a number of words *after* a given position and we do not
    # care for what shows up after a key phrase ends.
    unknowns_by_pos = match.query.unknowns_by_pos
    stopwords_by_pos = match.query.stopwords_by_pos
    key_phrase_span_minus_last_position = Span(
        key_phrase_span.start,
        key_phrase_span.end - 1,
    )
    return not any(
        qpos in unknowns_by_pos or qpos in stopwords_by_pos
        for qpos, ipos in zip(qspan, ispan)
        if ipos in key_phrase_span_minus_last_position
    )


def filter_key_phrase_spans(matches):
    """
    Return a filtered list of kept LicenseMatch matches and a list of
    discardable matches by removing all matches that do not contain all key
    phrases required by the rule.

    A match is kept when every Span of its rule `key_phrase_spans` is matched
    in full, contiguously in the query, and without unknown words or stopwords
    inside the phrase. A match for a rule without key phrases is always kept.
    """
    kept = []
    discarded = []

    for match in matches:
        # all() stops at the first key phrase that is not properly matched
        has_key_phrases = all(
            _is_key_phrase_matched(match, key_phrase_span)
            for key_phrase_span in match.rule.key_phrase_spans
        )
        if has_key_phrases:
            kept.append(match)
        else:
            discarded.append(match)

    return kept, discarded
considered a valid match + key_phrase_spans = attr.ib(default=attr.Factory(list), repr=False) + # These attributes are computed upon text loading or setting the thresholds ########################################################################### @@ -1171,6 +1180,13 @@ def tokens(self): self.length = length self.compute_relevance() + def key_phrases(self): + """ + Return an iterable of Spans marking the positions of key phrases that must + be present for this rule to be a valid match. + """ + yield from get_key_phrases(self.text()) + def compute_thresholds(self, small_rule=SMALL_RULE): """ Compute and set thresholds either considering the occurrence of all @@ -1648,3 +1664,32 @@ def find_rule_base_location(name_prefix, rules_directory=rules_data_dir): if not exists(f'{base_loc}.RULE'): return base_loc idx += 1 + + +def get_key_phrases(text): + """ + Return an iterable of Spans marking the positions of key phrases in the given + text string. Words are considered to be key phrases if they are enclosed in the + KEY_PHRASE_OPEN and KEY_PHRASE_CLOSE characters. 
+ """ + key_phrase_iterator = key_phrase_tokenizer(text) + key_phrase_index = 0 + for token in key_phrase_iterator: + if token.startswith(KEY_PHRASE_OPEN): + span_positions = [] + + # keep appending key phrase until we hit KEY_PHRASE_CLOSE + for key_phrase in key_phrase_iterator: + if key_phrase.endswith(KEY_PHRASE_CLOSE): + break + span_positions.append(key_phrase_index) + key_phrase_index += 1 + + if not key_phrase.endswith(KEY_PHRASE_CLOSE): + span_start_position = span_positions[0] if span_positions else 0 + raise InvalidRule("Key phrase definition started at token '%d' is not closed" % span_start_position) + + if span_positions: + yield Span(span_positions) + else: + key_phrase_index += 1 diff --git a/src/licensedcode/query.py b/src/licensedcode/query.py index c1c02e3348a..31264784ce8 100644 --- a/src/licensedcode/query.py +++ b/src/licensedcode/query.py @@ -228,7 +228,7 @@ def __init__( # index of "known positions" (yes really!) to a number of unknown tokens # after that known position. For unknowns at the start, the position is # using the magic -1 key - self.unknowns_by_pos = defaultdict(int) + self.unknowns_by_pos = {} # Span of "known positions" (yes really!) followed by unknown token(s) self.unknowns_span = None @@ -236,7 +236,7 @@ def __init__( # index of "known positions" (yes really!) to a number of stopword # tokens after that known position. For stopwords at the start, the # position is using the magic -1 key - self.stopwords_by_pos = defaultdict(int) + self.stopwords_by_pos = {} # Span of "known positions" (yes really!) 
followed by stopwords self.stopwords_span = None @@ -355,12 +355,12 @@ def tokens_with_unknowns(self): """ unknowns = self.unknowns_by_pos # yield anything at the start - for _ in range(unknowns[-1]): + for _ in range(unknowns.get(-1, 0)): yield None for pos, token in enumerate(self.tokens): yield token - for _ in range(unknowns[pos]): + for _ in range(unknowns.get(pos, 0)): yield None def tokens_by_line( @@ -386,11 +386,13 @@ def tokens_by_line( # bind frequently called functions to local scope line_by_pos_append = self.line_by_pos.append - self_unknowns_by_pos = self.unknowns_by_pos + # we use a defaultdict as a convenience at construction time + unknowns_by_pos = defaultdict(int) unknowns_pos = set() unknowns_pos_add = unknowns_pos.add - self_stopwords_by_pos = self.stopwords_by_pos + # we use a defaultdict as a convenience at construction time + stopwords_by_pos = defaultdict(int) stopwords_pos = set() stopwords_pos_add = stopwords_pos.add @@ -443,11 +445,11 @@ def tokens_by_line( # If we have not yet started globally, then all tokens # seen so far are stopwords and we keep a count of them # in the magic "-1" position. - self_stopwords_by_pos[-1] += 1 + stopwords_by_pos[-1] += 1 else: # here we have a new unknwon token positioned right after # the current known_pos - self_stopwords_by_pos[known_pos] += 1 + stopwords_by_pos[known_pos] += 1 stopwords_pos_add(known_pos) # we do not track stopwords, only their position continue @@ -456,11 +458,11 @@ def tokens_by_line( # If we have not yet started globally, then all tokens # seen so far are unknowns and we keep a count of them # in the magic "-1" position. 
- self_unknowns_by_pos[-1] += 1 + unknowns_by_pos[-1] += 1 else: # here we have a new unknwon token positioned right after # the current known_pos - self_unknowns_by_pos[known_pos] += 1 + unknowns_by_pos[known_pos] += 1 unknowns_pos_add(known_pos) line_tokens_append(tid) @@ -492,11 +494,14 @@ def tokens_by_line( yield line_tokens - # finally create a Span of positions followed by unkwnons and another - # for positions followed by stopwords used for intersection with the - # query span to do the scoring matches correctly + # finally update the attributes and create a Span of positions followed + # by unknowns and another for positions followed by stopwords used for + # intersection with the query span to do the scoring matches correctly self.unknowns_span = Span(unknowns_pos) self.stopwords_span = Span(stopwords_pos) + # also convert the defaultdicts back to plain dicts + self.unknowns_by_pos = dict(unknowns_by_pos) + self.stopwords_by_pos = dict(stopwords_by_pos) def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4): """ @@ -760,14 +765,14 @@ def tokens_with_unknowns(self): unknowns = self.query.unknowns_by_pos # yield anything at the start only if this is the first query run if self.start == 0: - for _ in range(unknowns[-1]): + for _ in range(unknowns.get(-1, 0)): yield None for pos, token in self.tokens_with_pos(): yield token if pos == self.end: break - for _ in range(unknowns[pos]): + for _ in range(unknowns.get(pos, 0)): yield None def tokens_with_pos(self): diff --git a/src/licensedcode/tokenize.py b/src/licensedcode/tokenize.py index 9d11a40d82b..c9a7be472de 100644 --- a/src/licensedcode/tokenize.py +++ b/src/licensedcode/tokenize.py @@ -62,6 +62,38 @@ def query_lines(location=None, query_string=None, strip=True, start_line=1): query_pattern = '[^_\\W]+\\+?[^_\\W]*' word_splitter = re.compile(query_pattern, re.UNICODE).findall +key_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})' +key_phrase_splitter = re.compile(key_phrase_pattern,
re.UNICODE).findall + +KEY_PHRASE_OPEN = "{{" +KEY_PHRASE_CLOSE = "}}" + +def key_phrase_tokenizer(text, stopwords=STOPWORDS): + """ + Return an iterable of tokens from a unicode query text. It must behave identically to the `index_tokenizer` with the + exception that it returns KEY_PHRASE_OPEN and KEY_PHRASE_CLOSE as separate tokens so that they can be used to parse + key phrases. + """ + if not text: + return [] + words = key_phrase_splitter(text.lower()) + + new_words = [] + for word in words: + if word.startswith(KEY_PHRASE_OPEN): + new_words.append(KEY_PHRASE_OPEN) + + stripped_word = word + if stripped_word.startswith(KEY_PHRASE_OPEN): + stripped_word = stripped_word[2:] + if stripped_word.endswith(KEY_PHRASE_CLOSE): + stripped_word = stripped_word[:-2] + new_words.append(stripped_word) + + if word.endswith(KEY_PHRASE_CLOSE): + new_words.append(KEY_PHRASE_CLOSE) + + return (token for token in new_words if token and token not in stopwords) def index_tokenizer(text, stopwords=STOPWORDS): """ diff --git a/tests/licensedcode/data/datadriven/lic1/alexa-skills-kit-sdk-for-java.txt b/tests/licensedcode/data/datadriven/lic1/alexa-skills-kit-sdk-for-java.txt new file mode 100644 index 00000000000..f5a06bb448c --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic1/alexa-skills-kit-sdk-for-java.txt @@ -0,0 +1,62 @@ +# ASK SDK Local Debug (Java) + +## About + +ASK SDK Local Debug is a package which enables you to test your skill code locally against your skill invocations by routing requests to your developer machine. This enables you to verify changes quickly to skill code as you can test without needing to deploy skill code to Lambda. + +> NOTE: Feature is currently only available to customers in the NA region(https://developer.amazon.com/en-US/docs/alexa/custom-skills/develop-skills-in-multiple-languages.html#h2-multiple-endpoints).
+ +## Installation + +Add the Maven package - ask-sdk-local-debug >= v1.0.0 as a [test dependency](https://maven.apache.org/guides/introduction/introduction-to-dependency-mechanism.html#dependency-scope) to your skill package. + +## Configuration + +### USING with other IDEs and Debuggers + +1. To instantiate a connection to the local debugging service, run the local debug application with `com.amazon.ask.localdebug.LocalDebuggerInvoker` as the start up program and the following arguments: +``` +--accessToken +--skillID +--skillStreamHandlerClass +--region # Optional argument. Region defaults to NA. +``` + 1. ACCESS_TOKEN: + 1. Install ASK CLI v2 + 2. npm install ask-cli@2 -g + 3. Generate the accessToken using ASK CLI + 4. ask util generate-lwa-tokens --scopes alexa::ask:skills:debug + 5. You will be directed to a Login with Amazon page. Sign in and retrieve your ACCESS_TOKEN from the terminal. + 2. SKILL_ID: The ID of the skill you are trying to debug. Ensure that the developer account you used to login to obtain the access token has access to this skill. + 3. SKILL_STREAM_HANDLER_CLASS: The fully qualified name of the class in your skill package that implements either the SkillStreamHandler or SkillServlet class. + 4. REGION: The region of the developer account. The accepted values are NA(North America), FE(Far East), EU(Europe). Defaults to NA. Instructions on finding out your region can be found [here](https://developer.amazon.com/en-US/docs/alexa/ask-toolkit/vs-code-testing-simulator.html#test). +2. Configure your preferred IDE or other debugging tool to attach to the above process or execute directly from your preferred IDE. For example, in VS Code, this would be included in the `launch.json`: +``` +{ + "type": "java", + "request": "launch", + "name": "Skill Debug", + "program": "<5. Program>", + "args": [ + "--accessToken","", + "--skillId", "", + "--skillStreamHandlerClass", "", + "--region", "" # Optional argument. Region defaults to NA. 
+ ] +} +``` + +## Things to note + +1. Local debugging is only available for a skill’s **`development`** stage. +2. A connection remains active for **1 hour. **You will need to reinstantiate the connection after 1 hour. +3. All Alexa requests for the skill will be routed to your development machine while the connection is active. +4. Only one connection session may be active for a given skill ID and developer account. + +## Opening Issues + +For bug reports, feature requests and questions, we would like to hear about it. Search the [existing issues](https://github.com/alexa/alexa-skills-kit-sdk-for-nodejs/issues) and try to make sure your problem doesn’t already exist before opening a new issue. It’s helpful if you include the version of the SDK, Node.js or browser environment and OS you’re using. Please include a stack trace and reduced repro case when appropriate, too. + +## License + +This SDK is distributed under the Apache License, Version 2.0, see LICENSE for more information. \ No newline at end of file diff --git a/tests/licensedcode/data/datadriven/lic1/alexa-skills-kit-sdk-for-java.txt.yml b/tests/licensedcode/data/datadriven/lic1/alexa-skills-kit-sdk-for-java.txt.yml new file mode 100644 index 00000000000..a9e02b1a85a --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic1/alexa-skills-kit-sdk-for-java.txt.yml @@ -0,0 +1,2 @@ +license_expressions: + - apache-2.0 diff --git a/tests/licensedcode/data/datadriven/lic1/boto-boto3-cw-example-creating-alarm.txt b/tests/licensedcode/data/datadriven/lic1/boto-boto3-cw-example-creating-alarm.txt new file mode 100644 index 00000000000..5a40b9b1e4f --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic1/boto-boto3-cw-example-creating-alarm.txt @@ -0,0 +1,148 @@ +.. Copyright 2010-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. + + This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 + International License (the "License"). 
You may not use this file except in compliance with the + License. A copy of the License is located at http://creativecommons.org/licenses/by-nc-sa/4.0/. + + This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + either express or implied. See the License for the specific language governing permissions and + limitations under the License. + +.. _aws-boto3-cw-creating-alarms: + +#################################### +Creating alarms in Amazon CloudWatch +#################################### + +This Python example shows you how to: + +* Get basic information about your CloudWatch alarms + +* Create and delete a CloudWatch alarm + +The scenario +============ + +An alarm watches a single metric over a time period you specify, and performs one or more actions +based on the value of the metric relative to a given threshold over a number of time periods. + +In this example, Python code is used to create alarms in CloudWatch. The code +uses the AWS SDK for Python to create alarms using these methods of the AWS.CloudWatch client class: + +* `paginate(StateValue='INSUFFICIENT_DATA') `_. + +* `put_metric_alarm `_. + +* `delete_alarms `_. + +For more information about CloudWatch alarms, see `Creating Amazon CloudWatch Alarms `_ +in the *Amazon CloudWatch User Guide*. + +All the example code for the Amazon Web Services (AWS) SDK for Python is available `here on GitHub `_. + +Prerequisite tasks +================= + +To set up and run this example, you must first configure your AWS credentials, as described in :doc:`quickstart`. + +Describe alarms +=============== + +The example below shows how to: + +* List metric alarms for insufficient data using + `paginate(StateValue='INSUFFICIENT_DATA') `_. + +For more information about paginators see, :doc:`paginators` + +Example +------- + +.. 
code-block:: python + + import boto3 + + # Create CloudWatch client + cloudwatch = boto3.client('cloudwatch') + + # List alarms of insufficient data through the pagination interface + paginator = cloudwatch.get_paginator('describe_alarms') + for response in paginator.paginate(StateValue='INSUFFICIENT_DATA'): + print(response['MetricAlarms']) + +Create an alarm for a CloudWatch Metric alarm +============================================= + +Create or update an alarm and associate it with the specified metric alarm. Optionally, this operation +can associate one or more Amazon SNS resources with the alarm. + +When this operation creates an alarm, the alarm state is immediately set to :code:`INSUFFICIENT_DATA`. +The alarm is evaluated and its state is set appropriately. Any actions associated with the state are +then executed. + +When you update an existing alarm, its state is left unchanged, but the update completely overwrites +the previous configuration of the alarm. + +The example below shows how to: + +* Create or update a metric alarm using + `put_metric_alarm `_. + +Example +------- + +.. code-block:: python + + import boto3 + + # Create CloudWatch client + cloudwatch = boto3.client('cloudwatch') + + # Create alarm + cloudwatch.put_metric_alarm( + AlarmName='Web_Server_CPU_Utilization', + ComparisonOperator='GreaterThanThreshold', + EvaluationPeriods=1, + MetricName='CPUUtilization', + Namespace='AWS/EC2', + Period=60, + Statistic='Average', + Threshold=70.0, + ActionsEnabled=False, + AlarmDescription='Alarm when server CPU exceeds 70%', + Dimensions=[ + { + 'Name': 'InstanceId', + 'Value': 'INSTANCE_ID' + }, + ], + Unit='Seconds' + ) + + +Delete an alarm +=============== + +Delete the specified alarms. In the event of an error, no alarms are deleted. + +The example below shows how to: + +* Delete a metric alarm using + `delete_alarms `_. + +Example +------- + +.. 
code-block:: python + + import boto3 + + # Create CloudWatch client + cloudwatch = boto3.client('cloudwatch') + + # Delete alarm + cloudwatch.delete_alarms( + AlarmNames=['Web_Server_CPU_Utilization'], + ) + + diff --git a/tests/licensedcode/data/datadriven/lic1/boto-boto3-cw-example-creating-alarm.txt.yml b/tests/licensedcode/data/datadriven/lic1/boto-boto3-cw-example-creating-alarm.txt.yml new file mode 100644 index 00000000000..ae1e88edf4e --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic1/boto-boto3-cw-example-creating-alarm.txt.yml @@ -0,0 +1,2 @@ +license_expressions: + - cc-by-nc-sa-4.0 \ No newline at end of file diff --git a/tests/licensedcode/data/datadriven/lic1/erlware-relx.txt b/tests/licensedcode/data/datadriven/lic1/erlware-relx.txt new file mode 100644 index 00000000000..a143106abc1 --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic1/erlware-relx.txt @@ -0,0 +1,194 @@ +Apache License +============== +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +------------------------------------------------------------ + +1. Definitions. + + "License" shall mean the terms and conditions for use, + reproduction, and distribution as defined by Sections 1 through 9 + of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making + modifications, including but not limited to software source code, + documentation source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but not + limited to compiled object code, generated documentation, and + conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work (an + example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other + modifications represent, as a whole, an original work of + authorship. For the purposes of this License, Derivative Works + shall not include works that remain separable from, or merely link + (or bind by name) to the interfaces of, the Work and Derivative + Works thereof. + + "Contribution" shall mean any work of authorship, including the + original version of the Work and any modifications or additions to + that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright + owner or by an individual or Legal Entity authorized to submit on + behalf of the copyright owner. 
For the purposes of this definition, + "submitted" means any form of electronic, verbal, or written + communication sent to the Licensor or its representatives, + including but not limited to communication on electronic mailing + lists, source code control systems, and issue tracking systems that + are managed by, or on behalf of, the Licensor for the purpose of + discussing and improving the Work, but excluding communication that + is conspicuously marked or otherwise designated in writing by the + copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal + Entity on behalf of whom a Contribution has been received by + Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have + made, use, offer to sell, sell, import, and otherwise transfer the + Work, where such license applies only to those patent claims + licensable by such Contributor that are necessarily infringed by + their Contribution(s) alone or by combination of their + Contribution(s) with the Work to which such Contribution(s) was + submitted. 
If You institute patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging + that the Work or a Contribution incorporated within the Work + constitutes direct or contributory patent infringement, then any + patent licenses granted to You under this License for that Work + shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work + or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You meet + the following conditions: + + (a) You must give any other recipients of the Work or Derivative + Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works that + You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, excluding + those notices that do not pertain to any part of the Derivative + Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one of + the following places: within a NOTICE text file distributed as + part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and do + not modify the License. 
You may add Your own attribution notices + within Derivative Works that You distribute, alongside or as an + addendum to the NOTICE text from the Work, provided that such + additional attribution notices cannot be construed as modifying + the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing + the origin of the Work and reproducing the content of the NOTICE + file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed + to in writing, Licensor provides the Work (and each Contributor + provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES + OR CONDITIONS OF ANY KIND, either express or implied, including, + without limitation, any warranties or conditions of TITLE, + NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR + PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this + License. + +8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor has + been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, and + charge a fee for, acceptance of support, warranty, indemnity, or + other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
diff --git a/tests/licensedcode/data/datadriven/lic1/erlware-relx.txt.yml b/tests/licensedcode/data/datadriven/lic1/erlware-relx.txt.yml new file mode 100644 index 00000000000..4a7047a6dfa --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic1/erlware-relx.txt.yml @@ -0,0 +1,4 @@ +license_expressions: + - apache-2.0 + - apache-2.0 + - apache-2.0 \ No newline at end of file diff --git a/tests/licensedcode/data/datadriven/lic1/jenkinsci-role-strategy-plugin.txt b/tests/licensedcode/data/datadriven/lic1/jenkinsci-role-strategy-plugin.txt new file mode 100644 index 00000000000..a0a29d07580 --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic1/jenkinsci-role-strategy-plugin.txt @@ -0,0 +1,109 @@ + + 4.0.0 + + + org.jenkins-ci.plugins + plugin + 4.18 + + + + role-strategy + hpi + ${revision}${changelist} + Role-based Authorization Strategy + https://github.com/jenkinsci/role-strategy-plugin + + + scm:git:ssh://github.com/jenkinsci/${project.artifactId}-plugin.git + scm:git:ssh://git@github.com/jenkinsci/${project.artifactId}-plugin.git + https://github.com/jenkinsci/${project.artifactId}-plugin + ${scmTag} + + + + + tmaurel + Thomas Maurel + +1 + + + rseguy + Romain Seguy + +1 + + + Oleg Nenashev + oleg_nenashev + o.v.nenashev@gmail.com + + + + + 3.2.1 + -SNAPSHOT + 2.222.4 + 8 + true + + + + + repo.jenkins-ci.org + https://repo.jenkins-ci.org/public/ + + + + + + repo.jenkins-ci.org + https://repo.jenkins-ci.org/public/ + + + + + + MIT License + https://opensource.org/licenses/MIT + repo + + + + + + + io.jenkins.tools.bom + bom-2.222.x + 887.vae9c8ac09ff7 + import + pom + + + + + + + org.jenkins-ci.plugins + matrix-auth + + + io.jenkins.plugins + caffeine-api + + + org.jenkins-ci.plugins + cloudbees-folder + test + + + io.jenkins + configuration-as-code + true + + + io.jenkins.configuration-as-code + test-harness + test + + + diff --git a/tests/licensedcode/data/datadriven/lic1/jenkinsci-role-strategy-plugin.txt.yml 
b/tests/licensedcode/data/datadriven/lic1/jenkinsci-role-strategy-plugin.txt.yml new file mode 100644 index 00000000000..eedfaa085e5 --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic1/jenkinsci-role-strategy-plugin.txt.yml @@ -0,0 +1,3 @@ +license_expressions: + - mit + - mit diff --git a/tests/licensedcode/data/datadriven/lic1/uwiger-parse_trans.txt b/tests/licensedcode/data/datadriven/lic1/uwiger-parse_trans.txt new file mode 100644 index 00000000000..a1dfd493896 --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic1/uwiger-parse_trans.txt @@ -0,0 +1,25 @@ +%%% The contents of this file are subject to the Erlang Public License, +%%% Version 1.1, (the "License"); you may not use this file except in +%%% compliance with the License. You may obtain a copy of the License at +%%% http://www.erlang.org/EPLICENSE +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is parse_trans-2.0. +%% +%% Copyright (c) 2014 Ericsson AB +%% +%% Contributor(s): ______________________________________. + +%%------------------------------------------------------------------- +%% File : codegen.hrl +%% @author : Ulf Wiger +%% @end +%% Description : +%% +%% Created : 25 Feb 2010 by Ulf Wiger +%%------------------------------------------------------------------- +-compile({parse_transform, parse_trans_codegen}). 
diff --git a/tests/licensedcode/data/datadriven/lic1/uwiger-parse_trans.txt.yml b/tests/licensedcode/data/datadriven/lic1/uwiger-parse_trans.txt.yml new file mode 100644 index 00000000000..2011cf30b7b --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic1/uwiger-parse_trans.txt.yml @@ -0,0 +1,3 @@ +license_expressions: + - erlangpl-1.1 + - erlangpl-1.1 \ No newline at end of file diff --git a/tests/licensedcode/data/datadriven/lic2/python-cpython-classes.txt b/tests/licensedcode/data/datadriven/lic2/python-cpython-classes.txt new file mode 100644 index 00000000000..0d780e3ba89 --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic2/python-cpython-classes.txt @@ -0,0 +1,927 @@ +.. _tut-classes: + +******* +Classes +******* + +Classes provide a means of bundling data and functionality together. Creating +a new class creates a new *type* of object, allowing new *instances* of that +type to be made. Each class instance can have attributes attached to it for +maintaining its state. Class instances can also have methods (defined by its +class) for modifying its state. + +Compared with other programming languages, Python's class mechanism adds classes +with a minimum of new syntax and semantics. It is a mixture of the class +mechanisms found in C++ and Modula-3. Python classes provide all the standard +features of Object Oriented Programming: the class inheritance mechanism allows +multiple base classes, a derived class can override any methods of its base +class or classes, and a method can call the method of a base class with the same +name. Objects can contain arbitrary amounts and kinds of data. As is true for +modules, classes partake of the dynamic nature of Python: they are created at +runtime, and can be modified further after creation. + +In C++ terminology, normally class members (including the data members) are +*public* (except see below :ref:`tut-private`), and all member functions are +*virtual*. 
As in Modula-3, there are no shorthands for referencing the object's +members from its methods: the method function is declared with an explicit first +argument representing the object, which is provided implicitly by the call. As +in Smalltalk, classes themselves are objects. This provides semantics for +importing and renaming. Unlike C++ and Modula-3, built-in types can be used as +base classes for extension by the user. Also, like in C++, most built-in +operators with special syntax (arithmetic operators, subscripting etc.) can be +redefined for class instances. + +(Lacking universally accepted terminology to talk about classes, I will make +occasional use of Smalltalk and C++ terms. I would use Modula-3 terms, since +its object-oriented semantics are closer to those of Python than C++, but I +expect that few readers have heard of it.) + + +.. _tut-object: + +A Word About Names and Objects +============================== + +Objects have individuality, and multiple names (in multiple scopes) can be bound +to the same object. This is known as aliasing in other languages. This is +usually not appreciated on a first glance at Python, and can be safely ignored +when dealing with immutable basic types (numbers, strings, tuples). However, +aliasing has a possibly surprising effect on the semantics of Python code +involving mutable objects such as lists, dictionaries, and most other types. +This is usually used to the benefit of the program, since aliases behave like +pointers in some respects. For example, passing an object is cheap since only a +pointer is passed by the implementation; and if a function modifies an object +passed as an argument, the caller will see the change --- this eliminates the +need for two different argument passing mechanisms as in Pascal. + + +.. _tut-scopes: + +Python Scopes and Namespaces +============================ + +Before introducing classes, I first have to tell you something about Python's +scope rules. 
Class definitions play some neat tricks with namespaces, and you +need to know how scopes and namespaces work to fully understand what's going on. +Incidentally, knowledge about this subject is useful for any advanced Python +programmer. + +Let's begin with some definitions. + +A *namespace* is a mapping from names to objects. Most namespaces are currently +implemented as Python dictionaries, but that's normally not noticeable in any +way (except for performance), and it may change in the future. Examples of +namespaces are: the set of built-in names (containing functions such as :func:`abs`, and +built-in exception names); the global names in a module; and the local names in +a function invocation. In a sense the set of attributes of an object also form +a namespace. The important thing to know about namespaces is that there is +absolutely no relation between names in different namespaces; for instance, two +different modules may both define a function ``maximize`` without confusion --- +users of the modules must prefix it with the module name. + +By the way, I use the word *attribute* for any name following a dot --- for +example, in the expression ``z.real``, ``real`` is an attribute of the object +``z``. Strictly speaking, references to names in modules are attribute +references: in the expression ``modname.funcname``, ``modname`` is a module +object and ``funcname`` is an attribute of it. In this case there happens to be +a straightforward mapping between the module's attributes and the global names +defined in the module: they share the same namespace! [#]_ + +Attributes may be read-only or writable. In the latter case, assignment to +attributes is possible. Module attributes are writable: you can write +``modname.the_answer = 42``. Writable attributes may also be deleted with the +:keyword:`del` statement. For example, ``del modname.the_answer`` will remove +the attribute :attr:`the_answer` from the object named by ``modname``. 
+ +Namespaces are created at different moments and have different lifetimes. The +namespace containing the built-in names is created when the Python interpreter +starts up, and is never deleted. The global namespace for a module is created +when the module definition is read in; normally, module namespaces also last +until the interpreter quits. The statements executed by the top-level +invocation of the interpreter, either read from a script file or interactively, +are considered part of a module called :mod:`__main__`, so they have their own +global namespace. (The built-in names actually also live in a module; this is +called :mod:`builtins`.) + +The local namespace for a function is created when the function is called, and +deleted when the function returns or raises an exception that is not handled +within the function. (Actually, forgetting would be a better way to describe +what actually happens.) Of course, recursive invocations each have their own +local namespace. + +A *scope* is a textual region of a Python program where a namespace is directly +accessible. "Directly accessible" here means that an unqualified reference to a +name attempts to find the name in the namespace. + +Although scopes are determined statically, they are used dynamically. At any +time during execution, there are 3 or 4 nested scopes whose namespaces are +directly accessible: + +* the innermost scope, which is searched first, contains the local names +* the scopes of any enclosing functions, which are searched starting with the + nearest enclosing scope, contains non-local, but also non-global names +* the next-to-last scope contains the current module's global names +* the outermost scope (searched last) is the namespace containing built-in names + +If a name is declared global, then all references and assignments go directly to +the middle scope containing the module's global names. 
To rebind variables +found outside of the innermost scope, the :keyword:`nonlocal` statement can be +used; if not declared nonlocal, those variables are read-only (an attempt to +write to such a variable will simply create a *new* local variable in the +innermost scope, leaving the identically named outer variable unchanged). + +Usually, the local scope references the local names of the (textually) current +function. Outside functions, the local scope references the same namespace as +the global scope: the module's namespace. Class definitions place yet another +namespace in the local scope. + +It is important to realize that scopes are determined textually: the global +scope of a function defined in a module is that module's namespace, no matter +from where or by what alias the function is called. On the other hand, the +actual search for names is done dynamically, at run time --- however, the +language definition is evolving towards static name resolution, at "compile" +time, so don't rely on dynamic name resolution! (In fact, local variables are +already determined statically.) + +A special quirk of Python is that -- if no :keyword:`global` or :keyword:`nonlocal` +statement is in effect -- assignments to names always go into the innermost scope. +Assignments do not copy data --- they just bind names to objects. The same is true +for deletions: the statement ``del x`` removes the binding of ``x`` from the +namespace referenced by the local scope. In fact, all operations that introduce +new names use the local scope: in particular, :keyword:`import` statements and +function definitions bind the module or function name in the local scope. + +The :keyword:`global` statement can be used to indicate that particular +variables live in the global scope and should be rebound there; the +:keyword:`nonlocal` statement indicates that particular variables live in +an enclosing scope and should be rebound there. + +.. 
_tut-scopeexample: + +Scopes and Namespaces Example +----------------------------- + +This is an example demonstrating how to reference the different scopes and +namespaces, and how :keyword:`global` and :keyword:`nonlocal` affect variable +binding:: + + def scope_test(): + def do_local(): + spam = "local spam" + + def do_nonlocal(): + nonlocal spam + spam = "nonlocal spam" + + def do_global(): + global spam + spam = "global spam" + + spam = "test spam" + do_local() + print("After local assignment:", spam) + do_nonlocal() + print("After nonlocal assignment:", spam) + do_global() + print("After global assignment:", spam) + + scope_test() + print("In global scope:", spam) + +The output of the example code is: + +.. code-block:: none + + After local assignment: test spam + After nonlocal assignment: nonlocal spam + After global assignment: nonlocal spam + In global scope: global spam + +Note how the *local* assignment (which is default) didn't change *scope_test*\'s +binding of *spam*. The :keyword:`nonlocal` assignment changed *scope_test*\'s +binding of *spam*, and the :keyword:`global` assignment changed the module-level +binding. + +You can also see that there was no previous binding for *spam* before the +:keyword:`global` assignment. + + +.. _tut-firstclasses: + +A First Look at Classes +======================= + +Classes introduce a little bit of new syntax, three new object types, and some +new semantics. + + +.. _tut-classdefinition: + +Class Definition Syntax +----------------------- + +The simplest form of class definition looks like this:: + + class ClassName: + <statement-1> + . + . + . + <statement-N> + +Class definitions, like function definitions (:keyword:`def` statements) must be +executed before they have any effect. (You could conceivably place a class +definition in a branch of an :keyword:`if` statement, or inside a function.)
+ +In practice, the statements inside a class definition will usually be function +definitions, but other statements are allowed, and sometimes useful --- we'll +come back to this later. The function definitions inside a class normally have +a peculiar form of argument list, dictated by the calling conventions for +methods --- again, this is explained later. + +When a class definition is entered, a new namespace is created, and used as the +local scope --- thus, all assignments to local variables go into this new +namespace. In particular, function definitions bind the name of the new +function here. + +When a class definition is left normally (via the end), a *class object* is +created. This is basically a wrapper around the contents of the namespace +created by the class definition; we'll learn more about class objects in the +next section. The original local scope (the one in effect just before the class +definition was entered) is reinstated, and the class object is bound here to the +class name given in the class definition header (:class:`ClassName` in the +example). + + +.. _tut-classobjects: + +Class Objects +------------- + +Class objects support two kinds of operations: attribute references and +instantiation. + +*Attribute references* use the standard syntax used for all attribute references +in Python: ``obj.name``. Valid attribute names are all the names that were in +the class's namespace when the class object was created. So, if the class +definition looked like this:: + + class MyClass: + """A simple example class""" + i = 12345 + + def f(self): + return 'hello world' + +then ``MyClass.i`` and ``MyClass.f`` are valid attribute references, returning +an integer and a function object, respectively. Class attributes can also be +assigned to, so you can change the value of ``MyClass.i`` by assignment. +:attr:`__doc__` is also a valid attribute, returning the docstring belonging to +the class: ``"A simple example class"``. 
+ +Class *instantiation* uses function notation. Just pretend that the class +object is a parameterless function that returns a new instance of the class. +For example (assuming the above class):: + + x = MyClass() + +creates a new *instance* of the class and assigns this object to the local +variable ``x``. + +The instantiation operation ("calling" a class object) creates an empty object. +Many classes like to create objects with instances customized to a specific +initial state. Therefore a class may define a special method named +:meth:`__init__`, like this:: + + def __init__(self): + self.data = [] + +When a class defines an :meth:`__init__` method, class instantiation +automatically invokes :meth:`__init__` for the newly-created class instance. So +in this example, a new, initialized instance can be obtained by:: + + x = MyClass() + +Of course, the :meth:`__init__` method may have arguments for greater +flexibility. In that case, arguments given to the class instantiation operator +are passed on to :meth:`__init__`. For example, :: + + >>> class Complex: + ... def __init__(self, realpart, imagpart): + ... self.r = realpart + ... self.i = imagpart + ... + >>> x = Complex(3.0, -4.5) + >>> x.r, x.i + (3.0, -4.5) + + +.. _tut-instanceobjects: + +Instance Objects +---------------- + +Now what can we do with instance objects? The only operations understood by +instance objects are attribute references. There are two kinds of valid +attribute names: data attributes and methods. + +*data attributes* correspond to "instance variables" in Smalltalk, and to "data +members" in C++. Data attributes need not be declared; like local variables, +they spring into existence when they are first assigned to. 
For example, if +``x`` is the instance of :class:`MyClass` created above, the following piece of +code will print the value ``16``, without leaving a trace:: + + x.counter = 1 + while x.counter < 10: + x.counter = x.counter * 2 + print(x.counter) + del x.counter + +The other kind of instance attribute reference is a *method*. A method is a +function that "belongs to" an object. (In Python, the term method is not unique +to class instances: other object types can have methods as well. For example, +list objects have methods called append, insert, remove, sort, and so on. +However, in the following discussion, we'll use the term method exclusively to +mean methods of class instance objects, unless explicitly stated otherwise.) + +.. index:: object: method + +Valid method names of an instance object depend on its class. By definition, +all attributes of a class that are function objects define corresponding +methods of its instances. So in our example, ``x.f`` is a valid method +reference, since ``MyClass.f`` is a function, but ``x.i`` is not, since +``MyClass.i`` is not. But ``x.f`` is not the same thing as ``MyClass.f`` --- it +is a *method object*, not a function object. + + +.. _tut-methodobjects: + +Method Objects +-------------- + +Usually, a method is called right after it is bound:: + + x.f() + +In the :class:`MyClass` example, this will return the string ``'hello world'``. +However, it is not necessary to call a method right away: ``x.f`` is a method +object, and can be stored away and called at a later time. For example:: + + xf = x.f + while True: + print(xf()) + +will continue to print ``hello world`` until the end of time. + +What exactly happens when a method is called? You may have noticed that +``x.f()`` was called without an argument above, even though the function +definition for :meth:`f` specified an argument. What happened to the argument? 
+Surely Python raises an exception when a function that requires an argument is +called without any --- even if the argument isn't actually used... + +Actually, you may have guessed the answer: the special thing about methods is +that the instance object is passed as the first argument of the function. In our +example, the call ``x.f()`` is exactly equivalent to ``MyClass.f(x)``. In +general, calling a method with a list of *n* arguments is equivalent to calling +the corresponding function with an argument list that is created by inserting +the method's instance object before the first argument. + +If you still don't understand how methods work, a look at the implementation can +perhaps clarify matters. When a non-data attribute of an instance is +referenced, the instance's class is searched. If the name denotes a valid class +attribute that is a function object, a method object is created by packing +(pointers to) the instance object and the function object just found together in +an abstract object: this is the method object. When the method object is called +with an argument list, a new argument list is constructed from the instance +object and the argument list, and the function object is called with this new +argument list. + + +.. 
_tut-class-and-instance-variables: + +Class and Instance Variables +---------------------------- + +Generally speaking, instance variables are for data unique to each instance +and class variables are for attributes and methods shared by all instances +of the class:: + + class Dog: + + kind = 'canine' # class variable shared by all instances + + def __init__(self, name): + self.name = name # instance variable unique to each instance + + >>> d = Dog('Fido') + >>> e = Dog('Buddy') + >>> d.kind # shared by all dogs + 'canine' + >>> e.kind # shared by all dogs + 'canine' + >>> d.name # unique to d + 'Fido' + >>> e.name # unique to e + 'Buddy' + +As discussed in :ref:`tut-object`, shared data can have possibly surprising +effects when involving :term:`mutable` objects such as lists and dictionaries. +For example, the *tricks* list in the following code should not be used as a +class variable because just a single list would be shared by all *Dog* +instances:: + + class Dog: + + tricks = [] # mistaken use of a class variable + + def __init__(self, name): + self.name = name + + def add_trick(self, trick): + self.tricks.append(trick) + + >>> d = Dog('Fido') + >>> e = Dog('Buddy') + >>> d.add_trick('roll over') + >>> e.add_trick('play dead') + >>> d.tricks # unexpectedly shared by all dogs + ['roll over', 'play dead'] + +Correct design of the class should use an instance variable instead:: + + class Dog: + + def __init__(self, name): + self.name = name + self.tricks = [] # creates a new empty list for each dog + + def add_trick(self, trick): + self.tricks.append(trick) + + >>> d = Dog('Fido') + >>> e = Dog('Buddy') + >>> d.add_trick('roll over') + >>> e.add_trick('play dead') + >>> d.tricks + ['roll over'] + >>> e.tricks + ['play dead'] + + +.. _tut-remarks: + +Random Remarks +============== + +.. These should perhaps be placed more carefully...
+ +If the same attribute name occurs in both an instance and in a class, +then attribute lookup prioritizes the instance:: + + >>> class Warehouse: + purpose = 'storage' + region = 'west' + + >>> w1 = Warehouse() + >>> print(w1.purpose, w1.region) + storage west + >>> w2 = Warehouse() + >>> w2.region = 'east' + >>> print(w2.purpose, w2.region) + storage east + +Data attributes may be referenced by methods as well as by ordinary users +("clients") of an object. In other words, classes are not usable to implement +pure abstract data types. In fact, nothing in Python makes it possible to +enforce data hiding --- it is all based upon convention. (On the other hand, +the Python implementation, written in C, can completely hide implementation +details and control access to an object if necessary; this can be used by +extensions to Python written in C.) + +Clients should use data attributes with care --- clients may mess up invariants +maintained by the methods by stamping on their data attributes. Note that +clients may add data attributes of their own to an instance object without +affecting the validity of the methods, as long as name conflicts are avoided --- +again, a naming convention can save a lot of headaches here. + +There is no shorthand for referencing data attributes (or other methods!) from +within methods. I find that this actually increases the readability of methods: +there is no chance of confusing local variables and instance variables when +glancing through a method. + +Often, the first argument of a method is called ``self``. This is nothing more +than a convention: the name ``self`` has absolutely no special meaning to +Python. Note, however, that by not following the convention your code may be +less readable to other Python programmers, and it is also conceivable that a +*class browser* program might be written that relies upon such a convention. + +Any function object that is a class attribute defines a method for instances of +that class. 
It is not necessary that the function definition is textually +enclosed in the class definition: assigning a function object to a local +variable in the class is also ok. For example:: + + # Function defined outside the class + def f1(self, x, y): + return min(x, x+y) + + class C: + f = f1 + + def g(self): + return 'hello world' + + h = g + +Now ``f``, ``g`` and ``h`` are all attributes of class :class:`C` that refer to +function objects, and consequently they are all methods of instances of +:class:`C` --- ``h`` being exactly equivalent to ``g``. Note that this practice +usually only serves to confuse the reader of a program. + +Methods may call other methods by using method attributes of the ``self`` +argument:: + + class Bag: + def __init__(self): + self.data = [] + + def add(self, x): + self.data.append(x) + + def addtwice(self, x): + self.add(x) + self.add(x) + +Methods may reference global names in the same way as ordinary functions. The +global scope associated with a method is the module containing its +definition. (A class is never used as a global scope.) While one +rarely encounters a good reason for using global data in a method, there are +many legitimate uses of the global scope: for one thing, functions and modules +imported into the global scope can be used by methods, as well as functions and +classes defined in it. Usually, the class containing the method is itself +defined in this global scope, and in the next section we'll find some good +reasons why a method would want to reference its own class. + +Each value is an object, and therefore has a *class* (also called its *type*). +It is stored as ``object.__class__``. + + +.. _tut-inheritance: + +Inheritance +=========== + +Of course, a language feature would not be worthy of the name "class" without +supporting inheritance. The syntax for a derived class definition looks like +this:: + + class DerivedClassName(BaseClassName): + + . + . + . 
+ + +The name :class:`BaseClassName` must be defined in a scope containing the +derived class definition. In place of a base class name, other arbitrary +expressions are also allowed. This can be useful, for example, when the base +class is defined in another module:: + + class DerivedClassName(modname.BaseClassName): + +Execution of a derived class definition proceeds the same as for a base class. +When the class object is constructed, the base class is remembered. This is +used for resolving attribute references: if a requested attribute is not found +in the class, the search proceeds to look in the base class. This rule is +applied recursively if the base class itself is derived from some other class. + +There's nothing special about instantiation of derived classes: +``DerivedClassName()`` creates a new instance of the class. Method references +are resolved as follows: the corresponding class attribute is searched, +descending down the chain of base classes if necessary, and the method reference +is valid if this yields a function object. + +Derived classes may override methods of their base classes. Because methods +have no special privileges when calling other methods of the same object, a +method of a base class that calls another method defined in the same base class +may end up calling a method of a derived class that overrides it. (For C++ +programmers: all methods in Python are effectively ``virtual``.) + +An overriding method in a derived class may in fact want to extend rather than +simply replace the base class method of the same name. There is a simple way to +call the base class method directly: just call ``BaseClassName.methodname(self, +arguments)``. This is occasionally useful to clients as well. (Note that this +only works if the base class is accessible as ``BaseClassName`` in the global +scope.) 
+ +Python has two built-in functions that work with inheritance: + +* Use :func:`isinstance` to check an instance's type: ``isinstance(obj, int)`` + will be ``True`` only if ``obj.__class__`` is :class:`int` or some class + derived from :class:`int`. + +* Use :func:`issubclass` to check class inheritance: ``issubclass(bool, int)`` + is ``True`` since :class:`bool` is a subclass of :class:`int`. However, + ``issubclass(float, int)`` is ``False`` since :class:`float` is not a + subclass of :class:`int`. + + + +.. _tut-multiple: + +Multiple Inheritance +-------------------- + +Python supports a form of multiple inheritance as well. A class definition with +multiple base classes looks like this:: + + class DerivedClassName(Base1, Base2, Base3): + <statement-1> + . + . + . + <statement-N> + +For most purposes, in the simplest cases, you can think of the search for +attributes inherited from a parent class as depth-first, left-to-right, not +searching twice in the same class where there is an overlap in the hierarchy. +Thus, if an attribute is not found in :class:`DerivedClassName`, it is searched +for in :class:`Base1`, then (recursively) in the base classes of :class:`Base1`, +and if it is not found there, it is searched for in :class:`Base2`, and so on. + +In fact, it is slightly more complex than that; the method resolution order +changes dynamically to support cooperative calls to :func:`super`. This +approach is known in some other multiple-inheritance languages as +call-next-method and is more powerful than the super call found in +single-inheritance languages. + +Dynamic ordering is necessary because all cases of multiple inheritance exhibit +one or more diamond relationships (where at least one of the parent classes +can be accessed through multiple paths from the bottommost class). For example, +all classes inherit from :class:`object`, so any case of multiple inheritance +provides more than one path to reach :class:`object`.
To keep the base classes +from being accessed more than once, the dynamic algorithm linearizes the search +order in a way that preserves the left-to-right ordering specified in each +class, that calls each parent only once, and that is monotonic (meaning that a +class can be subclassed without affecting the precedence order of its parents). +Taken together, these properties make it possible to design reliable and +extensible classes with multiple inheritance. For more detail, see +https://www.python.org/download/releases/2.3/mro/. + + +.. _tut-private: + +Private Variables +================= + +"Private" instance variables that cannot be accessed except from inside an +object don't exist in Python. However, there is a convention that is followed +by most Python code: a name prefixed with an underscore (e.g. ``_spam``) should +be treated as a non-public part of the API (whether it is a function, a method +or a data member). It should be considered an implementation detail and subject +to change without notice. + +.. index:: + pair: name; mangling + +Since there is a valid use-case for class-private members (namely to avoid name +clashes of names with names defined by subclasses), there is limited support for +such a mechanism, called :dfn:`name mangling`. Any identifier of the form +``__spam`` (at least two leading underscores, at most one trailing underscore) +is textually replaced with ``_classname__spam``, where ``classname`` is the +current class name with leading underscore(s) stripped. This mangling is done +without regard to the syntactic position of the identifier, as long as it +occurs within the definition of a class. + +Name mangling is helpful for letting subclasses override methods without +breaking intraclass method calls. 
For example:: + + class Mapping: + def __init__(self, iterable): + self.items_list = [] + self.__update(iterable) + + def update(self, iterable): + for item in iterable: + self.items_list.append(item) + + __update = update # private copy of original update() method + + class MappingSubclass(Mapping): + + def update(self, keys, values): + # provides new signature for update() + # but does not break __init__() + for item in zip(keys, values): + self.items_list.append(item) + +The above example would work even if ``MappingSubclass`` were to introduce a +``__update`` identifier since it is replaced with ``_Mapping__update`` in the +``Mapping`` class and ``_MappingSubclass__update`` in the ``MappingSubclass`` +class respectively. + +Note that the mangling rules are designed mostly to avoid accidents; it still is +possible to access or modify a variable that is considered private. This can +even be useful in special circumstances, such as in the debugger. + +Notice that code passed to ``exec()`` or ``eval()`` does not consider the +classname of the invoking class to be the current class; this is similar to the +effect of the ``global`` statement, the effect of which is likewise restricted +to code that is byte-compiled together. The same restriction applies to +``getattr()``, ``setattr()`` and ``delattr()``, as well as when referencing +``__dict__`` directly. + + +.. _tut-odds: + +Odds and Ends +============= + +Sometimes it is useful to have a data type similar to the Pascal "record" or C +"struct", bundling together a few named data items. An empty class definition +will do nicely:: + + class Employee: + pass + + john = Employee() # Create an empty employee record + + # Fill the fields of the record + john.name = 'John Doe' + john.dept = 'computer lab' + john.salary = 1000 + +A piece of Python code that expects a particular abstract data type can often be +passed a class that emulates the methods of that data type instead. 
For +instance, if you have a function that formats some data from a file object, you +can define a class with methods :meth:`read` and :meth:`!readline` that get the +data from a string buffer instead, and pass it as an argument. + +.. (Unfortunately, this technique has its limitations: a class can't define + operations that are accessed by special syntax such as sequence subscripting + or arithmetic operators, and assigning such a "pseudo-file" to sys.stdin will + not cause the interpreter to read further input from it.) + +Instance method objects have attributes, too: ``m.__self__`` is the instance +object with the method :meth:`m`, and ``m.__func__`` is the function object +corresponding to the method. + + +.. _tut-iterators: + +Iterators +========= + +By now you have probably noticed that most container objects can be looped over +using a :keyword:`for` statement:: + + for element in [1, 2, 3]: + print(element) + for element in (1, 2, 3): + print(element) + for key in {'one':1, 'two':2}: + print(key) + for char in "123": + print(char) + for line in open("myfile.txt"): + print(line, end='') + +This style of access is clear, concise, and convenient. The use of iterators +pervades and unifies Python. Behind the scenes, the :keyword:`for` statement +calls :func:`iter` on the container object. The function returns an iterator +object that defines the method :meth:`~iterator.__next__` which accesses +elements in the container one at a time. When there are no more elements, +:meth:`~iterator.__next__` raises a :exc:`StopIteration` exception which tells the +:keyword:`!for` loop to terminate. 
You can call the :meth:`~iterator.__next__` method +using the :func:`next` built-in function; this example shows how it all works:: + + >>> s = 'abc' + >>> it = iter(s) + >>> it + <str_iterator object at 0x10c90e650> + >>> next(it) + 'a' + >>> next(it) + 'b' + >>> next(it) + 'c' + >>> next(it) + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + next(it) + StopIteration + +Having seen the mechanics behind the iterator protocol, it is easy to add +iterator behavior to your classes. Define an :meth:`__iter__` method which +returns an object with a :meth:`~iterator.__next__` method. If the class +defines :meth:`__next__`, then :meth:`__iter__` can just return ``self``:: + + class Reverse: + """Iterator for looping over a sequence backwards.""" + def __init__(self, data): + self.data = data + self.index = len(data) + + def __iter__(self): + return self + + def __next__(self): + if self.index == 0: + raise StopIteration + self.index = self.index - 1 + return self.data[self.index] + +:: + + >>> rev = Reverse('spam') + >>> iter(rev) + <__main__.Reverse object at 0x00A1DB50> + >>> for char in rev: + ... print(char) + ... + m + a + p + s + + +.. _tut-generators: + +Generators +========== + +:term:`Generators <generator>` are a simple and powerful tool for creating iterators. They +are written like regular functions but use the :keyword:`yield` statement +whenever they want to return data. Each time :func:`next` is called on it, the +generator resumes where it left off (it remembers all the data values and which +statement was last executed). An example shows that generators can be trivially +easy to create:: + + def reverse(data): + for index in range(len(data)-1, -1, -1): + yield data[index] + +:: + + >>> for char in reverse('golf'): + ... print(char) + ... + f + l + o + g + +Anything that can be done with generators can also be done with class-based +iterators as described in the previous section.
What makes generators so +compact is that the :meth:`__iter__` and :meth:`~generator.__next__` methods +are created automatically. + +Another key feature is that the local variables and execution state are +automatically saved between calls. This made the function easier to write and +much more clear than an approach using instance variables like ``self.index`` +and ``self.data``. + +In addition to automatic method creation and saving program state, when +generators terminate, they automatically raise :exc:`StopIteration`. In +combination, these features make it easy to create iterators with no more effort +than writing a regular function. + + +.. _tut-genexps: + +Generator Expressions +===================== + +Some simple generators can be coded succinctly as expressions using a syntax +similar to list comprehensions but with parentheses instead of square brackets. +These expressions are designed for situations where the generator is used right +away by an enclosing function. Generator expressions are more compact but less +versatile than full generator definitions and tend to be more memory friendly +than equivalent list comprehensions. + +Examples:: + + >>> sum(i*i for i in range(10)) # sum of squares + 285 + + >>> xvec = [10, 20, 30] + >>> yvec = [7, 5, 3] + >>> sum(x*y for x,y in zip(xvec, yvec)) # dot product + 260 + + >>> unique_words = set(word for line in page for word in line.split()) + + >>> valedictorian = max((student.gpa, student.name) for student in graduates) + + >>> data = 'golf' + >>> list(data[i] for i in range(len(data)-1, -1, -1)) + ['f', 'l', 'o', 'g'] + + + +.. rubric:: Footnotes + +.. [#] Except for one thing. Module objects have a secret read-only attribute called + :attr:`~object.__dict__` which returns the dictionary used to implement the module's + namespace; the name :attr:`~object.__dict__` is an attribute but not a global name. 
+ Obviously, using this violates the abstraction of namespace implementation, and + should be restricted to things like post-mortem debuggers. diff --git a/tests/licensedcode/data/datadriven/lic2/python-cpython-classes.txt.yml b/tests/licensedcode/data/datadriven/lic2/python-cpython-classes.txt.yml new file mode 100644 index 00000000000..33de434c831 --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic2/python-cpython-classes.txt.yml @@ -0,0 +1,5 @@ +license_expressions: +notes: | + It does not contain any license declarations, but matched with + `bsd-simplified_and_gpl-2.0_1.RULE` because of "unless explicitly stated + otherwise". \ No newline at end of file diff --git a/tests/licensedcode/data/datadriven/lic3/python-mypy-manifest.txt b/tests/licensedcode/data/datadriven/lic3/python-mypy-manifest.txt new file mode 100644 index 00000000000..fc657091a3d --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic3/python-mypy-manifest.txt @@ -0,0 +1,47 @@ +# some of the prunes here are so that check-manifest doesn't complain about their exclusion +# as such, be judicious in your use of prune + +# stubs +prune mypy/typeshed +include mypy/typeshed/LICENSE +include mypy/typeshed/stdlib/VERSIONS +recursive-include mypy/typeshed *.pyi + +# mypy and mypyc +include mypy/py.typed +recursive-include mypy *.py +recursive-include mypyc *.py + +# random +include mypy_bootstrap.ini +graft mypy/xml +graft scripts + +# docs +graft docs +prune docs/build +prune docs/source/_build + +# assorted mypyc requirements +graft mypyc/external +graft mypyc/lib-rt +graft mypyc/test-data +graft mypyc/doc + +# files necessary for testing sdist +include mypy-requirements.txt +include build-requirements.txt +include test-requirements.txt +include mypy_self_check.ini +prune misc +include misc/proper_plugin.py +graft test-data +include conftest.py +include runtests.py +include pytest.ini + +include LICENSE mypyc/README.md +exclude .gitmodules CONTRIBUTING.md CREDITS ROADMAP.md tox.ini action.yml 
+ +global-exclude *.py[cod] +global-exclude .DS_Store diff --git a/tests/licensedcode/data/datadriven/lic3/python-mypy-manifest.txt.yml b/tests/licensedcode/data/datadriven/lic3/python-mypy-manifest.txt.yml new file mode 100644 index 00000000000..4da1ec42f00 --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic3/python-mypy-manifest.txt.yml @@ -0,0 +1,4 @@ +license_expressions: +notes: | + This file does not contain any license declarations. It should not match with + `apache-2.0_and_cc-by-sa-4.0_2.RULE` at all. \ No newline at end of file diff --git a/tests/licensedcode/data/datadriven/lic4/mattjohnsonpint-timezoneconverter-license.txt b/tests/licensedcode/data/datadriven/lic4/mattjohnsonpint-timezoneconverter-license.txt new file mode 100644 index 00000000000..7516f6555d1 --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic4/mattjohnsonpint-timezoneconverter-license.txt @@ -0,0 +1,30 @@ +TimeZoneConverter +Copyright (c) 2017 Matt Johnson-Pint +https://github.com/mattjohnsonpint/TimeZoneConverter + +While we certainly hope this software is useful, none of the authors or +contributors place any guarantees as to the accuracy of the data or the +results returned by using this library. + +This library is distributed under the terms of the MIT License: + +----------------------------------------------------------------------------- +The MIT License (MIT) + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +----------------------------------------------------------------------------- \ No newline at end of file diff --git a/tests/licensedcode/data/datadriven/lic4/mattjohnsonpint-timezoneconverter-license.txt.yml b/tests/licensedcode/data/datadriven/lic4/mattjohnsonpint-timezoneconverter-license.txt.yml new file mode 100644 index 00000000000..2adad98be25 --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic4/mattjohnsonpint-timezoneconverter-license.txt.yml @@ -0,0 +1,3 @@ +license_expressions: + - mit + - mit \ No newline at end of file diff --git a/tests/licensedcode/data/datadriven/lic4/saxon-he-pom.txt b/tests/licensedcode/data/datadriven/lic4/saxon-he-pom.txt new file mode 100644 index 00000000000..bcb91f7d3fd --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic4/saxon-he-pom.txt @@ -0,0 +1,289 @@ + + + 4.0.0 + net.sf.saxon + Saxon-HE-enhanced-accuracy + 10.3 + jar + Saxon-HE - European Norm 16931: enhanced financial accuracy + The XSLT Processor is using the far more accurate decimal-based floating-point arithmetic and half-up rounding for VAT rounding according to EU law + http://www.saxonica.com/ + + + Mozilla Public License Version 2.0 + http://www.mozilla.org/MPL/2.0/ + repo + + + + scm:svn:https://dev.saxonica.com/repos/archive/opensource/ + scm:svn:https://dev.saxonica.com/repos/archive/opensource/ + https://dev.saxonica.com/repos/archive/opensource/ + + + + mike + Michael Kay + mike@saxonica.com + + + ond1 + O'Neil Delpratt + oneil@saxonica.com + + + debbie + Debbie Lockett + 
debbie@saxonica.com + + + john + John Lumley + john@saxonica.com + + + norm + Norman Walsh + norm@saxonica.com + + + + UTF-8 + false + true + false + yyyy-MM-dd'T'HH:mm:ss + 1.8 + 1.8 + + + Saxon Community + https://saxonica.plan.io/projects/saxon + + 1999 + + + User List + saxon-help@lists.sourceforge.net + http://sourceforge.net/mailarchive/forum.php?forum_name=saxon-help + + + + + org.jdom + jdom + 2.0.2 + true + + + org.jdom + jdom2 + 2.0.6 + true + + + com.io7m.xom + xom + 1.2.10 + true + + + + com.ibm.icu + icu4j + 68.1 + true + + + org.dom4j + dom4j + 2.1.3 + true + + + xml-resolver + xml-resolver + 1.2 + true + + + org.apache.ws.commons.axiom + axiom-impl + 1.2.22 + true + + + jline + jline + 2.14.6 + true + + + + org.jetbrains + annotations + 19.0.0 + + + org.junit.jupiter + junit-jupiter-engine + 5.7.2 + test + + + org.junit.platform + junit-platform-runner + 1.7.2 + test + + + + Saxonica + http://www.saxonica.com + + + org.sonatype.oss + oss-parent + 9 + + + + + + + maven-compiler-plugin + 3.8.1 + + + maven-surefire-plugin + 3.0.0-M5 + + + slow + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.2.0 + + + + net.sf.saxon.Transform + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.3.0 + + ${project.artifactId} + + http://docs.oracle.com/javase/8/docs/api/ + http://xerces.apache.org/xerces-j/apiDocs/ + + true + ${project.artifactId} API v${project.version} - ${project.scm.url} + + + + attach-javadocs + + jar + + + ${javadoc.opts} + + + + + + maven-source-plugin + 3.2.1 + + + attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.22.2 + + + + ${project.version} + + + + + failsafe-it + + integration-test + verify + + + + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.3.0 + + ${project.artifactId} + 512m + 1024m + + http://download.oracle.com/javase/8/docs/api/ + http://xerces.apache.org/xerces-j/apiDocs/ + + true + ${project.artifactId} API v${project.version} - ${project.scm.url} + + + 
+ + org.apache.maven.plugins + maven-failsafe-plugin + 3.0.0-M5 + + + integration-tests + + report-only + + + failsafe-report + + + + + + + + \ No newline at end of file diff --git a/tests/licensedcode/data/datadriven/lic4/saxon-he-pom.txt.yml b/tests/licensedcode/data/datadriven/lic4/saxon-he-pom.txt.yml new file mode 100644 index 00000000000..d5a26661fa2 --- /dev/null +++ b/tests/licensedcode/data/datadriven/lic4/saxon-he-pom.txt.yml @@ -0,0 +1,2 @@ +license_expressions: + - mpl-2.0 \ No newline at end of file diff --git a/tests/licensedcode/data/detect/rule_template/rules/0149_sun-bcl-11-07.RULE b/tests/licensedcode/data/detect/rule_template/rules/0149_sun-bcl-11-07.RULE index 4c78865d113..689d5d54783 100644 --- a/tests/licensedcode/data/detect/rule_template/rules/0149_sun-bcl-11-07.RULE +++ b/tests/licensedcode/data/detect/rule_template/rules/0149_sun-bcl-11-07.RULE @@ -134,7 +134,7 @@ No modification of this Agreement will be binding, unless in writing and signed by an authorized representative of each party. -{{ProductName}} +[[ProductName]] SUPPLEMENTAL LICENSE TERMS diff --git a/tests/licensedcode/data/query/unknown_positions/lz4.license.txt b/tests/licensedcode/data/query/unknown_positions/lz4.license.txt new file mode 100644 index 00000000000..c221aebd26e --- /dev/null +++ b/tests/licensedcode/data/query/unknown_positions/lz4.license.txt @@ -0,0 +1,11 @@ +This repository uses 2 different licenses : +- all files in the `lib` directory use a BSD 2-Clause license +- all other files use a GPLv2 license, unless explicitly stated otherwise + +Relevant license is reminded at the top of each source file, +and with presence of COPYING or LICENSE file in associated directories. + +This model is selected to emphasize that +files in the `lib` directory are designed to be included into 3rd party applications, +while all other files, in `programs`, `tests` or `examples`, +receive more limited attention and support for such scenario. 
diff --git a/tests/licensedcode/data/tokenize/htmlish.html b/tests/licensedcode/data/tokenize/htmlish.html index e0e848599eb..309cca50f64 100644 --- a/tests/licensedcode/data/tokenize/htmlish.html +++ b/tests/licensedcode/data/tokenize/htmlish.html @@ -93,7 +93,7 @@
-

Third party services

+

{{Third party services}}

Of course we are all for responsible disclosure, which you might have seen if you have read our blog. So here we have listed all the third party services we use in different ways in alphabetical order and an added a short comment on why or how we use them.

diff --git a/tests/licensedcode/data/tokenize/htmlish.html.expected.key_phrase_tokenizer.json b/tests/licensedcode/data/tokenize/htmlish.html.expected.key_phrase_tokenizer.json new file mode 100644 index 00000000000..1e4610fd92a --- /dev/null +++ b/tests/licensedcode/data/tokenize/htmlish.html.expected.key_phrase_tokenizer.json @@ -0,0 +1,2137 @@ +[ + [ + "doctype", + "html" + ], + [ + "html", + "lang", + "en", + "us", + "no", + "js" + ], + [ + "head" + ], + [ + "meta", + "charset", + "utf", + "8" + ], + [], + [ + "meta", + "name", + "apple", + "mobile", + "web", + "app", + "capable", + "content", + "yes" + ], + [ + "meta", + "name", + "handheldfriendly", + "content", + "true" + ], + [ + "meta", + "name", + "viewport", + "content", + "device", + "initial", + "scale", + "1", + "0", + "minimum", + "scale", + "1", + "0", + "minimal", + "ui" + ], + [ + "meta", + "name", + "google", + "site", + "verification", + "content", + "uftfn12ezxjaco9n1sqxui5kgagjrdjs", + "abg7l2nazq" + ], + [ + "title", + "detectify", + "go", + "hack", + "yourself", + "title" + ], + [ + "meta", + "name", + "description", + "content", + "detectify", + "provides", + "user", + "friendly", + "and", + "thorough", + "web", + "security", + "scan", + "that", + "allows", + "you", + "to", + "focus", + "100", + "on", + "web", + "development" + ], + [ + "link", + "shortcut", + "icon", + "favicon", + "ico", + "type", + "image", + "x", + "icon" + ], + [ + "link", + "stylesheet", + "static", + "gfx", + "css", + "newdefault", + "css", + "3900721321" + ], + [ + "type", + "text", + "javascript", + "static", + "js", + "libs", + "modernizr", + "2", + "0", + "6", + "dev", + "min", + "js" + ], + [ + "type", + "text", + "javascript", + "dynamic", + "globals", + "head", + "3900721321" + ], + [ + "type", + "text", + "javascript" + ], + [ + "var", + "ajax", + "key", + "1152637df16c5354802b2c083c8aa334" + ], + [], + [ + "head" + ], + [ + "no", + "js" + ], + [ + "type", + "text", + "javascript", + "dynamic", + "globals", + 
"3900721321" + ], + [ + "noscript", + "iframe", + "www", + "googletagmanager", + "com", + "ns", + "html", + "id", + "gtm", + "twt88b", + "0", + "0", + "style", + "display", + "none", + "visibility", + "hidden", + "iframe", + "noscript" + ], + [ + "main", + "cont", + "top" + ], + [ + "id", + "grid", + "hidden" + ], + [], + [ + "header", + "textpage", + "transparent", + "small", + "header" + ], + [ + "main", + "cont", + "clearfix" + ], + [ + "left" + ], + [ + "logo", + "icon", + "title", + "detectify", + "go", + "hack", + "yourself" + ], + [ + "logo", + "text", + "white", + "title", + "detectify", + "go", + "hack", + "yourself" + ], + [], + [ + "clean", + "left", + "top" + ], + [ + "about", + "about" + ], + [ + "pricing", + "pricing" + ], + [ + "technology", + "technology" + ], + [], + [ + "mobile", + "button", + "lines" + ], + [ + "right" + ], + [ + "clean", + "inline", + "list", + "header", + "list" + ], + [ + "welcome", + "account", + "button", + "white", + "sign", + "up" + ], + [ + "log", + "in", + "button", + "no", + "white", + "login", + "sign", + "in" + ], + [], + [], + [], + [ + "header" + ], + [ + "mobile", + "dropdown" + ], + [ + "mobile", + "menu" + ], + [ + "clean", + "inline", + "list", + "header", + "list" + ], + [ + "about", + "button", + "no", + "white", + "about" + ], + [ + "pricing", + "button", + "no", + "white", + "pricing" + ], + [], + [ + "welcome", + "account", + "button", + "white", + "sign", + "up" + ], + [ + "log", + "in", + "button", + "no", + "white", + "login", + "sign", + "in" + ], + [], + [ + "clean", + "inline", + "list", + "header", + "list" + ], + [ + "about", + "button", + "no", + "white", + "about" + ], + [ + "about", + "button", + "white", + "detectify" + ], + [ + "about", + "topic", + "team", + "button", + "no", + "white", + "team", + "detectify" + ], + [], + [ + "clean", + "inline", + "list", + "header", + "list" + ], + [ + "technology", + "topic", + "technology", + "button", + "no", + "white", + "technology" + ], + [ + 
"technology", + "topic", + "features", + "button", + "white", + "features" + ], + [ + "technology", + "topic", + "engine", + "button", + "no", + "white", + "detectify", + "engine" + ], + [ + "technology", + "topic", + "how", + "we", + "handle", + "security", + "button", + "no", + "white", + "our", + "security" + ], + [], + [], + [], + [], + [ + "login", + "splash" + ], + [ + "main", + "cont" + ], + [ + "card", + "cont", + "m" + ], + [ + "id", + "login", + "prompt", + "card", + "login" + ], + [ + "form", + "autocomplete", + "off", + "method", + "post", + "id", + "login", + "form", + "action", + "login" + ], + [ + "input", + "type", + "hidden", + "name", + "url", + "value", + "https", + "detectify", + "com", + "services" + ], + [ + "input", + "type", + "hidden", + "ajax", + "input", + "name", + "inp", + "f75d20f75dd6167f9703e86136252c9f", + "user", + "login", + "ajax" + ], + [ + "logo", + "text", + "black" + ], + [ + "center", + "sign", + "in", + "text", + "sign", + "in", + "to", + "continue", + "to", + "your", + "detectify", + "dashboard" + ], + [ + "login", + "error", + "your", + "username", + "or", + "password", + "is", + "incorrect" + ], + [ + "input", + "autocomplete", + "on", + "id", + "user", + "type", + "text", + "value", + "name", + "inp", + "f75d20f75dd6167f9703e86136252c9f", + "user", + "login", + "email", + "placeholder", + "email" + ], + [ + "input", + "id", + "pass", + "type", + "password", + "value", + "name", + "inp", + "f75d20f75dd6167f9703e86136252c9f", + "user", + "login", + "password", + "placeholder", + "password" + ], + [ + "button", + "button", + "input", + "size", + "blue", + "filled", + "thin", + "sign", + "in", + "button" + ], + [ + "clearfix", + "card", + "bottom" + ], + [ + "left", + "welcome", + "account", + "create", + "an", + "account" + ], + [ + "right", + "forgot", + "forgot", + "password" + ], + [], + [ + "form" + ], + [], + [ + "card", + "footer" + ], + [ + "darker", + "security", + "during", + "sign", + "in", + "we", + "use", + 
"https", + "digicert", + "high", + "assurance", + "to", + "encrypt", + "your", + "information" + ], + [], + [], + [], + [], + [ + "main", + "cont" + ], + [ + "main", + "text", + "cont" + ], + [ + "{{", + "third", + "party", + "services", + "}}" + ], + [], + [ + "of", + "course", + "we", + "are", + "all", + "for", + "responsible", + "disclosure", + "which", + "you", + "might", + "have", + "seen", + "if", + "you", + "have", + "read", + "our", + "blog", + "so", + "here", + "we", + "have", + "listed", + "all", + "the", + "third", + "party", + "services", + "we", + "use", + "in", + "different", + "ways", + "in", + "alphabetical", + "order", + "and", + "an", + "added", + "short", + "comment", + "on", + "why", + "or", + "how", + "we", + "use", + "them" + ], + [], + [], + [ + "big", + "main", + "text", + "cont" + ], + [ + "adyen" + ], + [], + [ + "for", + "payments", + "we", + "use", + "adyen", + "you", + "can", + "read", + "their", + "http", + "www", + "adyen", + "com", + "downloads", + "contract", + "adyentermsandconditions", + "pdf", + "terms", + "and", + "conditions", + "on", + "their", + "website" + ], + [], + [], + [ + "small", + "main", + "text", + "cont" + ], + [ + "amazon" + ], + [ + "we", + "use", + "various", + "amazon", + "services", + "in", + "ireland", + "including", + "amazon", + "ec2", + "amazon", + "rds", + "amazon", + "ses", + "and", + "amazon", + "s3", + "to", + "provide", + "detectify", + "to", + "you" + ], + [], + [ + "small", + "main", + "text", + "cont" + ], + [ + "desk" + ], + [], + [ + "whenever", + "you", + "have", + "question", + "for", + "us", + "we", + "will", + "receive", + "that", + "question", + "and", + "answer", + "it", + "via", + "desk", + "you", + "find", + "their", + "http", + "www", + "desk", + "com", + "privacy", + "privacy", + "policy", + "here", + "and", + "their", + "http", + "www", + "desk", + "com", + "terms", + "terms", + "of", + "use", + "here" + ], + [], + [], + [ + "small", + "main", + "text", + "cont" + ], + [], + [ + 
"disqus" + ], + [], + [ + "we", + "use", + "disqus", + "as", + "the", + "commenting", + "service", + "to", + "our", + "blog", + "so", + "perhaps", + "you", + "want", + "to", + "read", + "through", + "their", + "http", + "help", + "disqus", + "com", + "customer", + "portal", + "articles", + "466260", + "terms", + "of", + "service", + "terms", + "of", + "service", + "and", + "http", + "help", + "disqus", + "com", + "customer", + "portal", + "articles", + "466259", + "privacy", + "policy", + "privacy", + "policy" + ], + [], + [], + [ + "small", + "main", + "text", + "cont" + ], + [ + "effective", + "tlds" + ], + [], + [ + "used", + "by", + "registered", + "domain", + "libs" + ], + [ + "http", + "mxr", + "mozilla", + "org", + "mozilla", + "central", + "source", + "netwerk", + "dns", + "effective", + "tld", + "names", + "dat" + ], + [], + [], + [], + [ + "begin", + "license", + "block" + ], + [ + "version", + "mpl", + "1", + "1", + "gpl", + "2", + "0", + "lgpl", + "2", + "1" + ], + [], + [ + "the", + "contents", + "of", + "this", + "file", + "are", + "subject", + "to", + "the", + "mozilla", + "public", + "license", + "version" + ], + [ + "1", + "1", + "the", + "license", + "you", + "may", + "not", + "use", + "this", + "file", + "except", + "in", + "compliance", + "with" + ], + [ + "the", + "license", + "you", + "may", + "obtain", + "copy", + "of", + "the", + "license", + "at" + ], + [ + "http", + "www", + "mozilla", + "org", + "mpl" + ], + [], + [ + "software", + "distributed", + "under", + "the", + "license", + "is", + "distributed", + "on", + "an", + "as", + "is", + "basis" + ], + [ + "without", + "warranty", + "of", + "any", + "kind", + "either", + "express", + "or", + "implied", + "see", + "the", + "license" + ], + [ + "for", + "the", + "specific", + "language", + "governing", + "rights", + "and", + "limitations", + "under", + "the" + ], + [ + "license" + ], + [], + [ + "the", + "original", + "code", + "is", + "the", + "public", + "suffix", + "list" + ], + [], + [ + 
"the", + "initial", + "developer", + "of", + "the", + "original", + "code", + "is" + ], + [ + "jo", + "hermans", + "jo", + "hermans", + "gmail", + "com" + ], + [ + "portions", + "created", + "by", + "the", + "initial", + "developer", + "are", + "copyright", + "c", + "2007" + ], + [ + "the", + "initial", + "developer", + "all", + "rights", + "reserved" + ], + [], + [ + "contributor" + ], + [ + "ruben", + "arakelyan", + "ruben", + "rubenarakelyan", + "com" + ], + [ + "gervase", + "markham", + "gerv", + "gerv", + "net" + ], + [ + "pamela", + "greene", + "pamg", + "bugs", + "gmail", + "com" + ], + [ + "david", + "triendl", + "david", + "triendl", + "name" + ], + [ + "jothan", + "frakes", + "jothan", + "gmail", + "com" + ], + [ + "the", + "kind", + "representatives", + "of", + "many", + "tld", + "registries" + ], + [], + [ + "alternatively", + "the", + "contents", + "of", + "this", + "file", + "may", + "be", + "used", + "under", + "the", + "terms", + "of" + ], + [ + "either", + "the", + "gnu", + "general", + "public", + "license", + "version", + "2", + "or", + "later", + "the", + "gpl", + "or" + ], + [ + "the", + "gnu", + "lesser", + "general", + "public", + "license", + "version", + "2", + "1", + "or", + "later", + "the", + "lgpl" + ], + [ + "in", + "which", + "case", + "the", + "provisions", + "of", + "the", + "gpl", + "or", + "the", + "lgpl", + "are", + "applicable", + "instead" + ], + [ + "of", + "those", + "above", + "if", + "you", + "wish", + "to", + "allow", + "use", + "of", + "your", + "version", + "of", + "this", + "file", + "only" + ], + [ + "under", + "the", + "terms", + "of", + "either", + "the", + "gpl", + "or", + "the", + "lgpl", + "and", + "not", + "to", + "allow", + "others", + "to" + ], + [ + "use", + "your", + "version", + "of", + "this", + "file", + "under", + "the", + "terms", + "of", + "the", + "mpl", + "indicate", + "your" + ], + [ + "decision", + "by", + "deleting", + "the", + "provisions", + "above", + "and", + "replace", + "them", + "with", + 
"the", + "notice" + ], + [ + "and", + "other", + "provisions", + "required", + "by", + "the", + "gpl", + "or", + "the", + "lgpl", + "if", + "you", + "do", + "not", + "delete" + ], + [ + "the", + "provisions", + "above", + "recipient", + "may", + "use", + "your", + "version", + "of", + "this", + "file", + "under" + ], + [ + "the", + "terms", + "of", + "any", + "one", + "of", + "the", + "mpl", + "the", + "gpl", + "or", + "the", + "lgpl" + ], + [], + [ + "end", + "license", + "block" + ], + [], + [], + [], + [ + "small", + "main", + "text", + "cont" + ], + [ + "google", + "adwords" + ], + [ + "we", + "have", + "google", + "adwords", + "tracking", + "on", + "the", + "page", + "to", + "optimize", + "our", + "advertising", + "within", + "the", + "network", + "you", + "find", + "their", + "http", + "www", + "google", + "com", + "intl", + "en", + "policies", + "privacy", + "privacy", + "policy", + "here" + ], + [], + [ + "small", + "main", + "text", + "cont" + ], + [ + "google", + "analytics" + ], + [], + [ + "we", + "use", + "google", + "analytics", + "to", + "analyse", + "the", + "website", + "traffic", + "to", + "learn", + "more", + "about", + "the", + "http", + "www", + "google", + "com", + "analytics", + "terms", + "us", + "html", + "google", + "analytics", + "terms", + "of", + "service", + "you", + "can", + "read", + "here", + "you", + "can", + "read", + "more", + "about", + "the", + "http", + "www", + "google", + "com", + "analytics", + "learn", + "privacy", + "html", + "privacy", + "and", + "about", + "safeguarding", + "your", + "data", + "here" + ], + [], + [], + [ + "small", + "main", + "text", + "cont" + ], + [ + "idna", + "convert" + ], + [], + [ + "we", + "use", + "this", + "to", + "parse", + "idna", + "urls" + ], + [], + [], + [ + "{{", + "license" + ], + [], + [ + "vim", + "set", + "expandtab", + "tabstop", + "4", + "shiftwidth", + "4", + "softtabstop", + "4", + "foldmethod", + "marker" + ], + [], + [], + [ + "this", + "library", + "is", + "free", + 
"software", + "you", + "can", + "redistribute", + "it", + "and", + "or", + "modify" + ], + [ + "it", + "under", + "the", + "terms", + "of", + "the", + "gnu", + "lesser", + "general", + "public", + "license", + "as" + ], + [ + "published", + "by", + "the", + "free", + "software", + "foundation", + "either", + "version", + "2", + "1", + "of", + "the" + ], + [ + "license", + "or", + "at", + "your", + "option", + "any", + "later", + "version" + ], + [], + [ + "this", + "library", + "is", + "distributed", + "in", + "the", + "hope", + "that", + "it", + "will", + "be", + "useful", + "but" + ], + [ + "without", + "any", + "warranty", + "without", + "even", + "the", + "implied", + "warranty", + "of" + ], + [ + "merchantability", + "or", + "fitness", + "for", + "particular", + "purpose", + "see", + "the", + "gnu" + ], + [ + "lesser", + "general", + "public", + "license", + "for", + "more", + "details" + ], + [], + [ + "you", + "should", + "have", + "received", + "copy", + "of", + "the", + "gnu", + "lesser", + "general", + "public" + ], + [ + "license", + "along", + "with", + "this", + "library", + "if", + "not", + "write", + "to", + "the", + "free", + "software" + ], + [ + "foundation", + "inc", + "59", + "temple", + "place", + "suite", + "330", + "boston", + "ma", + "02111", + "1307" + ], + [ + "usa" + ], + [], + [], + [], + [ + "}}" + ], + [], + [], + [], + [ + "encode", + "decode", + "internationalized", + "domain", + "names" + ], + [], + [ + "the", + "allows", + "to", + "convert", + "internationalized", + "domain", + "names" + ], + [ + "see", + "rfc", + "3490", + "for", + "details", + "as", + "they", + "can", + "be", + "used", + "with", + "various", + "registries", + "worldwide" + ], + [ + "to", + "be", + "translated", + "between", + "their", + "original", + "localized", + "form", + "and", + "their", + "encoded", + "form" + ], + [ + "as", + "it", + "will", + "be", + "used", + "in", + "the", + "dns", + "domain", + "name", + "system" + ], + [], + [ + "the", + "provides", + 
"two", + "public", + "methods", + "encode", + "and", + "decode", + "which", + "do", + "exactly" + ], + [ + "what", + "you", + "would", + "expect", + "them", + "to", + "do", + "you", + "are", + "allowed", + "to", + "use", + "complete", + "domain", + "names" + ], + [ + "simple", + "strings", + "and", + "complete", + "email", + "addresses", + "as", + "well", + "that", + "means", + "that", + "you", + "might" + ], + [ + "use", + "any", + "of", + "the", + "following", + "notations" + ], + [], + [ + "www", + "n\u00f6rgler", + "com" + ], + [ + "xn", + "nrgler", + "wxa" + ], + [ + "xn", + "brse", + "5qa", + "xn", + "knrz", + "1ra", + "info" + ], + [], + [ + "unicode", + "input", + "might", + "be", + "given", + "as", + "either", + "utf", + "8", + "string", + "ucs", + "4", + "string", + "or", + "ucs", + "4", + "array" + ], + [ + "unicode", + "output", + "is", + "available", + "in", + "the", + "same", + "formats" + ], + [ + "you", + "can", + "select", + "your", + "preferred", + "format", + "via", + "link", + "set", + "paramter" + ], + [], + [ + "ace", + "input", + "and", + "output", + "is", + "always", + "expected", + "to", + "be", + "ascii" + ], + [], + [ + "author", + "matthias", + "sommerfeld", + "mso", + "phlylabs", + "de" + ], + [ + "copyright", + "2004", + "2011", + "phlylabs", + "berlin", + "http", + "phlylabs", + "de" + ], + [ + "version", + "0", + "8", + "0", + "2011", + "03", + "11" + ], + [], + [], + [], + [ + "small", + "main", + "text", + "cont" + ], + [], + [ + "malwaredomainlist" + ], + [], + [ + "we", + "use", + "malwaredomainlist", + "com", + "malwaredomainlist", + "com", + "list", + "of", + "infected", + "domains", + "to", + "see", + "if", + "scanned", + "domain", + "contains", + "malware" + ], + [], + [], + [ + "small", + "main", + "text", + "cont" + ], + [ + "mysql" + ], + [ + "for", + "some", + "of", + "our", + "temporary", + "storage", + "we", + "use", + "hardened", + "version", + "of", + "mysql" + ], + [], + [ + "small", + "main", + "text", + "cont" + ], 
+ [ + "new", + "bsd", + "license", + "bsd" + ], + [], + [ + "copyright", + "c", + "2010", + "noesis", + "innovation" + ], + [ + "all", + "rights", + "reserved" + ], + [], + [], + [ + "redistribution", + "and", + "use", + "in", + "source", + "and", + "binary", + "forms", + "with", + "or", + "without", + "modification", + "are", + "permitted", + "provided", + "that", + "the", + "following", + "conditions", + "are", + "met" + ], + [], + [], + [ + "redistributions", + "of", + "source", + "code", + "must", + "retain", + "the", + "above", + "copyright", + "notice", + "this", + "list", + "of", + "conditions", + "and", + "the", + "following", + "disclaimer" + ], + [ + "redistributions", + "in", + "binary", + "form", + "must", + "reproduce", + "the", + "above", + "copyright", + "notice", + "this", + "list", + "of", + "conditions", + "and", + "the", + "following", + "disclaimer", + "in", + "the", + "documentation", + "and", + "or", + "other", + "materials", + "provided", + "with", + "the", + "distribution" + ], + [ + "neither", + "the", + "name", + "of", + "noesis", + "innovation", + "nor", + "the", + "names", + "of", + "its", + "contributors", + "may", + "be", + "used", + "to", + "endorse", + "or", + "promote", + "products", + "derived", + "from", + "this", + "software", + "without", + "specific", + "prior", + "written", + "permission" + ], + [], + [], + [ + "this", + "software", + "is", + "provided", + "by", + "the", + "copyright", + "holders", + "and", + "contributors", + "as", + "is", + "and", + "any", + "express", + "or", + "implied", + "warranties", + "including", + "but", + "not", + "limited", + "to", + "the", + "implied", + "warranties", + "of", + "merchantability", + "and", + "fitness", + "for", + "particular", + "purpose", + "are", + "disclaimed", + "in", + "no", + "event", + "shall", + "the", + "copyright", + "owner", + "or", + "contributors", + "be", + "liable", + "for", + "any", + "direct", + "indirect", + "incidental", + "special", + "exemplary", + "or", + 
"consequential", + "damages", + "including", + "but", + "not", + "limited", + "to", + "procurement", + "of", + "substitute", + "goods", + "or", + "services", + "loss", + "of", + "use", + "data", + "or", + "profits", + "or", + "business", + "interruption", + "however", + "caused", + "and", + "on", + "any", + "theory", + "of", + "liability" + ], + [ + "whether", + "in", + "contract", + "strict", + "liability", + "or", + "tort", + "including", + "negligence", + "or", + "otherwise", + "arising", + "in", + "any", + "way", + "out", + "of", + "the", + "use", + "of", + "this", + "software", + "even", + "if", + "advised", + "of", + "the", + "possibility", + "of", + "such", + "damage" + ], + [], + [], + [ + "small", + "main", + "text", + "cont" + ], + [ + "optimizely" + ] +] \ No newline at end of file diff --git a/tests/licensedcode/data/tokenize/htmlish.html.expected.query_lines.json b/tests/licensedcode/data/tokenize/htmlish.html.expected.query_lines.json index ab4b19334bb..ff5fb326421 100644 --- a/tests/licensedcode/data/tokenize/htmlish.html.expected.query_lines.json +++ b/tests/licensedcode/data/tokenize/htmlish.html.expected.query_lines.json @@ -381,7 +381,7 @@ ], [ 96, - "

Third party services

" + "

{{Third party services}}

" ], [ 97, diff --git a/tests/licensedcode/data/tokenize/htmlish.txt b/tests/licensedcode/data/tokenize/htmlish.txt index 342555ab3ca..1cff3b2db94 100644 --- a/tests/licensedcode/data/tokenize/htmlish.txt +++ b/tests/licensedcode/data/tokenize/htmlish.txt @@ -1,5 +1,5 @@
-

MySQL

+

{{MySQL}}

For some of our temporary storage, we use a hardened version of MySQL.


diff --git a/tests/licensedcode/data/tokenize/htmlish.txt.expected.key_phrase_tokenizer.json b/tests/licensedcode/data/tokenize/htmlish.txt.expected.key_phrase_tokenizer.json new file mode 100644 index 00000000000..d24890fa9e7 --- /dev/null +++ b/tests/licensedcode/data/tokenize/htmlish.txt.expected.key_phrase_tokenizer.json @@ -0,0 +1,285 @@ +[ + [ + "small", + "main", + "text", + "cont" + ], + [ + "{{", + "mysql", + "}}" + ], + [ + "for", + "some", + "of", + "our", + "temporary", + "storage", + "we", + "use", + "hardened", + "version", + "of", + "mysql" + ], + [], + [ + "small", + "main", + "text", + "cont" + ], + [ + "new", + "bsd", + "license", + "bsd" + ], + [], + [ + "copyright", + "c", + "2010", + "noesis", + "innovation" + ], + [ + "all", + "rights", + "reserved" + ], + [], + [], + [ + "redistribution", + "and", + "use", + "in", + "source", + "and", + "binary", + "forms", + "with", + "or", + "without", + "modification", + "are", + "permitted", + "provided", + "that", + "the", + "following", + "conditions", + "are", + "met" + ], + [], + [], + [ + "redistributions", + "of", + "source", + "code", + "must", + "retain", + "the", + "above", + "copyright", + "notice", + "this", + "list", + "of", + "conditions", + "and", + "the", + "following", + "disclaimer" + ], + [ + "redistributions", + "in", + "binary", + "form", + "must", + "reproduce", + "the", + "above", + "copyright", + "notice", + "this", + "list", + "of", + "conditions", + "and", + "the", + "following", + "disclaimer", + "in", + "the", + "documentation", + "and", + "or", + "other", + "materials", + "provided", + "with", + "the", + "distribution" + ], + [ + "neither", + "the", + "name", + "of", + "noesis", + "innovation", + "nor", + "the", + "names", + "of", + "its", + "contributors", + "may", + "be", + "used", + "to", + "endorse", + "or", + "promote", + "products", + "derived", + "from", + "this", + "software", + "without", + "specific", + "prior", + "written", + "permission" + ], + [], + [], + [ + 
"this", + "software", + "is", + "provided", + "by", + "the", + "copyright", + "holders", + "and", + "contributors", + "as", + "is", + "and", + "any", + "express", + "or", + "implied", + "warranties", + "including", + "but", + "not", + "limited", + "to", + "the", + "implied", + "warranties", + "of", + "merchantability", + "and", + "fitness", + "for", + "particular", + "purpose", + "are", + "disclaimed", + "in", + "no", + "event", + "shall", + "the", + "copyright", + "owner", + "or", + "contributors", + "be", + "liable", + "for", + "any", + "direct", + "indirect", + "incidental", + "special", + "exemplary", + "or", + "consequential", + "damages", + "including", + "but", + "not", + "limited", + "to", + "procurement", + "of", + "substitute", + "goods", + "or", + "services", + "loss", + "of", + "use", + "data", + "or", + "profits", + "or", + "business", + "interruption", + "however", + "caused", + "and", + "on", + "any", + "theory", + "of", + "liability" + ], + [ + "whether", + "in", + "contract", + "strict", + "liability", + "or", + "tort", + "including", + "negligence", + "or", + "otherwise", + "arising", + "in", + "any", + "way", + "out", + "of", + "the", + "use", + "of", + "this", + "software", + "even", + "if", + "advised", + "of", + "the", + "possibility", + "of", + "such", + "damage" + ], + [], + [] +] \ No newline at end of file diff --git a/tests/licensedcode/data/tokenize/htmlish.txt.expected.query_lines.json b/tests/licensedcode/data/tokenize/htmlish.txt.expected.query_lines.json index db207e4c130..861e1305aa6 100644 --- a/tests/licensedcode/data/tokenize/htmlish.txt.expected.query_lines.json +++ b/tests/licensedcode/data/tokenize/htmlish.txt.expected.query_lines.json @@ -5,7 +5,7 @@ ], [ 2, - "

MySQL

" + "

{{MySQL}}

" ], [ 3, diff --git a/tests/licensedcode/test_match.py b/tests/licensedcode/test_match.py index 7f1c67cd47c..c49f86496be 100644 --- a/tests/licensedcode/test_match.py +++ b/tests/licensedcode/test_match.py @@ -9,11 +9,13 @@ # import os +import pytest from commoncode.testcase import FileBasedTesting from licensedcode import cache from licensedcode import index from licensedcode.index import LicenseIndex from licensedcode.match import filter_contained_matches +from licensedcode.match import filter_key_phrase_spans from licensedcode.match import filter_overlapping_matches from licensedcode.match import get_full_matched_text from licensedcode.match import LicenseMatch @@ -25,6 +27,7 @@ from licensedcode import models from licensedcode.models import Rule from licensedcode.models import load_rules +from licensedcode.query import Query from licensedcode.spans import Span TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') @@ -252,6 +255,102 @@ def test_LicenseMatch_score_is_not_100_with_aho_match_and_extra_unknown_token_ah match = idx.match(query_string=querys)[0] assert match.score() < 100 + def test_LicenseMatch_matches_only_when_all_key_phrases_are_present(self): + text_r1 = ( + 'License ' + 'Distributed under the {{MIT License}}. See LICENSE for more information.' + 'You can redistribute this file under this or any other license.') + r1 = Rule(text_file='r1', license_expression='mit', stored_text=text_r1) + idx = index.LicenseIndex([r1]) + + querys = ( + 'License ' + 'Distributed under the Apache License. See LICENSE for more information.' + 'You can redistribute this file under this or any other license.') + + matches = idx.match(query_string=querys) + assert len(matches) == 0 + + def test_LicenseMatch_matches_only_when_all_key_phrases_are_present_in_order(self): + text_r1 = ( + 'License ' + 'Distributed under the {{MIT License}}. See LICENSE for more information.' 
+ 'You can redistribute this file under this or any other license.') + r1 = Rule(text_file='r1', license_expression='mit', stored_text=text_r1) + idx = index.LicenseIndex([r1]) + + querys = ( + 'License ' + 'Distributed under the License MIT. See LICENSE for more information.' + 'You can redistribute this file under this or any other license.') + + matches = idx.match(query_string=querys) + assert len(matches) == 0 + + def test_LicenseMatch_matches_only_when_key_phrases_are_uninterrupted(self): + text_r1 = ( + 'License ' + 'Distributed under the {{MIT License}}. See LICENSE for more information.' + 'You can redistribute this file under this or any other license.') + r1 = Rule(text_file='r1', license_expression='mit', stored_text=text_r1) + idx = index.LicenseIndex([r1]) + + querys = ( + 'See LICENSE for more information, and also you can redistribute this file under this or any other license.' + 'License ' + 'Distributed under the MIT, Version 2 License. See LICENSE or website for more information.' + 'You can redistribute this file under this or any other license.' + ) + + matches = idx.match(query_string=querys) + assert len(matches) == 0 + + def test_LicenseMatch_matches_aho_with_exact_match(self): + text_r1 = ( + 'License ' + 'Distributed under the {{MIT License}}. See LICENSE for more information.' + 'You can redistribute this file under this or any other license.') + r1 = Rule(text_file='r1', license_expression='mit', stored_text=text_r1) + idx = index.LicenseIndex([r1]) + + querys = ( + 'License ' + 'Distributed under the MIT License. See LICENSE for more information.' + 'You can redistribute this file under this or any other license.' 
+ ) + + matches = idx.match(query_string=querys, _skip_hash_match=True) + assert len(matches) == 1 + + def test_LicenseMatch_matches_only_when_key_phrase_is_uninterrupted(self): + text_r1 = ( + 'under the {{Creative Commons Attribution 4.0 International License}} (the "License");' + 'you may not use this file except in compliance with the License.' + 'You may obtain a copy of the License at' + '' + ' http://creativecommons.org/licenses/by/4.0' + '' + 'This file is distributed on an "AS IS" BASIS,' + 'WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.' + 'See the License for the specific language governing permissions and' + 'limitations under the License.' + ) + r1 = Rule(text_file='r1', license_expression='mit', stored_text=text_r1) + idx = index.LicenseIndex([r1]) + + querys = """ + This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 + International License (the "License"). You may not use this file except in compliance with the + License. A copy of the License is located at http://creativecommons.org/licenses/by-nc-sa/4.0/. + + This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + either express or implied. See the License for the specific language governing permissions and + limitations under the License. 
+ """ + + matches = idx.match(query_string=querys, _skip_hash_match=True) + assert len(matches) == 0 + class TestMergeMatches(FileBasedTesting): test_data_dir = TEST_DATA_DIR @@ -796,6 +895,82 @@ def test_filter_overlapping_matches_matches_filters_matches_does_not_discard_non assert result == [m1] assert discarded == [m2] + def test_filter_key_phrases_keeps_matches_where_key_phrase_spans_is_fully_container_in_ispan(self): + idx = index.LicenseIndex() + query = Query(query_string="Lorum ipsum", idx=idx) + + r1 = Rule(text_file='r1', license_expression='apache-1.1', key_phrase_spans=[Span(2, 4)]) + + match_key_phrase_fully_contained = LicenseMatch(rule=r1, query=query, qspan=Span(0, 5), ispan=Span(0, 5)) + match_key_phrase_fully_outside = LicenseMatch(rule=r1, query=query, qspan=Span(5, 8), ispan=Span(5, 8)) + match_key_phrase_partially_contained = LicenseMatch(rule=r1, query=query, qspan=Span(0, 3), ispan=Span(0, 2)) + match_key_phrase_fully_containing = LicenseMatch(rule=r1, query=query, qspan=Span(3), ispan=Span(3)) + + kept, discarded = filter_key_phrase_spans([ + match_key_phrase_fully_contained, + match_key_phrase_fully_outside, + match_key_phrase_partially_contained, + match_key_phrase_fully_containing + ]) + assert kept == [ + match_key_phrase_fully_contained + ] + assert discarded == [ + match_key_phrase_fully_outside, + match_key_phrase_partially_contained, + match_key_phrase_fully_containing + ] + + def test_filter_key_phrases_discards_matches_where_qspan_intersects_with_unknown_or_stopwords(self): + idx = index.LicenseIndex() + query = Query(query_string="Lorum ipsum", idx=idx) + query.unknowns_by_pos = {12: 1} + query.stopwords_by_pos = {23: 1} + + r1 = Rule(text_file='r1', license_expression='apache-1.1', key_phrase_spans=[Span(2, 4)]) + + match_key_phrase_fully_contained = LicenseMatch(rule=r1, query=query, qspan=Span(0, 5), ispan=Span(0, 5)) + match_qspan_intersects_with_unknowns = LicenseMatch(rule=r1, query=query, qspan=Span(10, 15), ispan=Span(0, 
5)) + match_qspan_intersects_with_stopwords = LicenseMatch(rule=r1, query=query, qspan=Span(20, 25), ispan=Span(0, 5)) + + kept, discarded = filter_key_phrase_spans([ + match_key_phrase_fully_contained, + match_qspan_intersects_with_unknowns, + match_qspan_intersects_with_stopwords, + ]) + assert kept == [ + match_key_phrase_fully_contained + ] + assert discarded == [ + match_qspan_intersects_with_unknowns, + match_qspan_intersects_with_stopwords + ] + + def test_filter_key_phrases_discards_matches_where_key_phrase_is_interruped_in_qspan(self): + idx = index.LicenseIndex() + query = Query(query_string="Lorum ipsum", idx=idx) + query.unknowns_by_pos = {} + query.stopwords_by_pos = {} + + r1 = Rule(text_file='r1', license_expression='apache-1.1', key_phrase_spans=[Span(12, 14)]) + + match_qspan_ispan_same_matching = LicenseMatch(rule=r1, query=query, qspan=Span(10, 15), ispan=Span(10, 15)) + match_qspan_with_offset_matching = LicenseMatch(rule=r1, query=query, qspan=Span(20, 25), ispan=Span(10, 15)) + match_qspan_with_offset_not_matching = LicenseMatch(rule=r1, query=query, qspan=Span([20, 21, 22, 23, 25]), ispan=Span(10, 15)) + + kept, discarded = filter_key_phrase_spans([ + match_qspan_ispan_same_matching, + match_qspan_with_offset_matching, + match_qspan_with_offset_not_matching + ]) + assert kept == [ + match_qspan_ispan_same_matching, + match_qspan_with_offset_matching + ] + assert discarded == [ + match_qspan_with_offset_not_matching, + ] + class TestLicenseMatchScore(FileBasedTesting): test_data_dir = TEST_DATA_DIR @@ -873,10 +1048,10 @@ class TestCollectLicenseMatchTexts(FileBasedTesting): def test_get_full_matched_text_base(self): rule_text = u''' - Copyright {{some copyright}} - THIS IS FROM {{THE CODEHAUS}} AND CONTRIBUTORS - IN NO EVENT SHALL {{THE CODEHAUS}} OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE {{POSSIBILITY OF SUCH}} DAMAGE + Copyright [[some copyright]] + THIS IS FROM [[THE CODEHAUS]] AND CONTRIBUTORS + IN NO EVENT SHALL [[THE 
CODEHAUS]] OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE [[POSSIBILITY OF SUCH]] DAMAGE ''' rule = Rule(stored_text=rule_text, license_expression='test') @@ -924,10 +1099,10 @@ def test_get_full_matched_text_base(self): def test_get_full_matched_text(self): rule_text = u''' - Copyright {{some copyright}} - THIS IS FROM {{THE CODEHAUS}} AND CONTRIBUTORS - IN NO EVENT SHALL {{THE CODEHAUS}} OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE {{POSSIBILITY OF SUCH}} DAMAGE + Copyright [[some copyright]] + THIS IS FROM [[THE CODEHAUS]] AND CONTRIBUTORS + IN NO EVENT SHALL [[THE CODEHAUS]] OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE [[POSSIBILITY OF SUCH]] DAMAGE ''' rule = Rule(stored_text=rule_text, license_expression='test') diff --git a/tests/licensedcode/test_match_seq.py b/tests/licensedcode/test_match_seq.py index e034cfe930c..a6217be7649 100644 --- a/tests/licensedcode/test_match_seq.py +++ b/tests/licensedcode/test_match_seq.py @@ -30,9 +30,9 @@ def test_match_template_with_few_tokens_around_gaps_is_wholly_seq_matched(self): rule_text = u''' Copyright - THIS IS FROM {{THE OLD CODEHAUS}} AND CONTRIBUTORS - IN NO EVENT SHALL {{THE OLD CODEHAUS}} OR ITS CONTRIBUTORS BE LIABLE - EVEN IF ADVISED OF THE {{POSSIBILITY OF NEW SUCH}} DAMAGE + THIS IS FROM [[THE OLD CODEHAUS]] AND CONTRIBUTORS + IN NO EVENT SHALL [[THE OLD CODEHAUS]] OR ITS CONTRIBUTORS BE LIABLE + EVEN IF ADVISED OF THE [[POSSIBILITY OF NEW SUCH]] DAMAGE ''' rule = Rule(stored_text=rule_text, license_expression='test') diff --git a/tests/licensedcode/test_models.py b/tests/licensedcode/test_models.py index 155d6a55cb2..ee59a4245ae 100644 --- a/tests/licensedcode/test_models.py +++ b/tests/licensedcode/test_models.py @@ -10,13 +10,18 @@ import json import os +import pytest + from commoncode.testcase import FileBasedTesting from licensedcode import cache from licensedcode import index from licensedcode import models +from licensedcode.models import get_key_phrases +from 
licensedcode.models import InvalidRule from licensedcode.models import Rule from licensedcode.models import rules_data_dir +from unittest import TestCase as TestCaseClass TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') @@ -534,3 +539,89 @@ def test_Rule__validate_with_false_positive_rule(self): rule_dir = self.get_test_loc('models/rule_validate') rule = list(models.load_rules(rule_dir))[0] assert list(rule.validate()) == [] + + def test_key_phrases_yields_spans(self): + rule_stored_text = ( + 'This released software is {{released}} by under {{the MIT license}}. ' + 'Which is a license originating at Massachusetts Institute of Technology (MIT).' + ) + rule = models.Rule(license_expression='mit', stored_text=rule_stored_text) + + key_phrases = rule.key_phrases() + + assert list(key_phrases) == [models.Span(4), models.Span(7, 9)] + + def test_key_phrases_raises_exception_when_markup_is_not_closed(self): + rule_stored_text = ( + 'This released software is {{released}} by under {{the MIT license. ' + 'Which is a license originating at Massachusetts Institute of Technology (MIT).' + ) + rule = models.Rule(license_expression='mit', stored_text=rule_stored_text) + + actual_exception = None + try: + list(rule.key_phrases()) + except Exception as e: + actual_exception = e + + assert isinstance(actual_exception, InvalidRule) + assert "Key phrase definition started at token '7' is not closed" == str(actual_exception) + + +class TestGetKeyPhrases(TestCaseClass): + def test_get_key_phrases_yields_spans(self): + text = ( + 'This released software is {{released}} by under {{the MIT license}}. ' + 'Which is a license originating at Massachusetts Institute of Technology (MIT).' + ) + + key_phrases = get_key_phrases(text) + + assert list(key_phrases) == [models.Span(4), models.Span(7, 9)] + + def test_get_key_phrases_raises_exception_key_phrase_markup_is_not_closed(self): + text = 'This software is {{released by under the MIT license.' 
+ + actual_exception = None + try: + list(get_key_phrases(text)) + except Exception as e: + actual_exception = e + + assert isinstance(actual_exception, InvalidRule) + assert "Key phrase definition started at token '3' is not closed" == str(actual_exception) + + def test_get_key_phrases_ignores_stopwords_in_positions(self): + text = 'The word comma is a stop word so comma does not increase the span position {{MIT license}}.' + + key_phrases = get_key_phrases(text) + + assert list(key_phrases) == [models.Span(11, 12)] + + def test_get_key_phrases_yields_spans_without_stop_words(self): + text = 'This released software is {{released span}} by under {{the MIT quot license}}.' + + key_phrases = get_key_phrases(text) + + assert list(key_phrases) == [models.Span(4), models.Span(7, 9)] + + def test_get_key_phrases_does_not_yield_empty_spans(self): + text = 'This released software {{comma}} is {{}} by under {{the MIT license}}.' + + key_phrases = get_key_phrases(text) + + assert list(key_phrases) == [models.Span(6, 8)] + + def test_get_key_phrases_only_considers_outer_key_phrase_markup(self): + text = 'This released {{{software under the MIT}}} license.' + + key_phrases = get_key_phrases(text) + + assert list(key_phrases) == [models.Span(2, 5)] + + def test_get_key_phrases_ignores_nested_key_phrase_markup(self): + text = 'This released {{software {{under the}} MIT}} license.' 
+ + key_phrases = get_key_phrases(text) + + assert list(key_phrases) == [models.Span(2, 5)] diff --git a/tests/licensedcode/test_query.py b/tests/licensedcode/test_query.py index bf6d34cfdbf..17b2ca563e9 100644 --- a/tests/licensedcode/test_query.py +++ b/tests/licensedcode/test_query.py @@ -9,6 +9,7 @@ import json import os +from collections import defaultdict from commoncode.testcase import FileBasedTesting from licensedcode import cache @@ -395,6 +396,31 @@ def test_query_run_unknowns(self): expected = {-1: 2, 0: 4, 1: 3} assert dict(q.unknowns_by_pos) == expected + def test_query_unknowns_by_pos_and_stopwords_are_not_defaultdic_and_not_changed_on_query(self): + idx = index.LicenseIndex( + [Rule(stored_text='a is the binary')], + _legalese=set(['binary']), + _spdx_tokens=set() + ) + q = Query(query_string='binary that was a binary', idx=idx) + list(q.tokens_by_line()) + assert q.unknowns_by_pos == {0: 2} + assert q.stopwords_by_pos == {0: 1} + + assert not isinstance(q.unknowns_by_pos, defaultdict) + assert not isinstance(q.stopwords_by_pos, defaultdict) + + try: + q.unknowns_by_pos[1] + assert q.unknowns_by_pos == {0: 2} + except KeyError: + pass + try: + q.stopwords_by_pos[1] + assert q.stopwords_by_pos == {0: 1} + except KeyError: + pass + class TestQueryWithMultipleRuns(IndexTesting): @@ -768,3 +794,52 @@ def test_query_run_for_text_with_long_lines(self): idx = cache.get_index() assert len(Query(location1, idx=idx).query_runs) == 17 assert len(Query(location2, idx=idx).query_runs) == 15 + + def test_match_does_not_change_query_unknown_positions(self): + from licensedcode.match import LicenseMatch + from licensedcode.spans import Span + + location = self.get_test_loc('query/unknown_positions/lz4.license.txt') + idx = cache.get_index() + # build a query first + qry1 = Query(location, idx=idx) + # this has the side effect to populate the unknown + txt = u' '.join(f'{i}-{idx.tokens_by_tid[t]}' for i, t in enumerate(qry1.tokens)) + assert txt == ( + '0-this 
1-repository 2-uses 3-2 4-different 5-licenses ' + '6-all 7-files 8-in 9-the 10-lib 11-directory 12-use 13-bsd 14-2 15-clause 16-license ' + '17-all 18-other 19-files 20-use 21-gplv2 22-license 23-unless 24-explicitly 25-stated 26-otherwise ' + '27-relevant 28-license 29-is 30-reminded 31-at 32-the 33-top 34-of 35-each 36-source 37-file ' + '38-and 39-with 40-presence 41-of 42-copying 43-or 44-license 45-file 46-in 47-associated 48-directories ' + '49-this 50-model 51-is 52-selected 53-to 54-emphasize 55-that ' + '56-files 57-in 58-the 59-lib 60-directory 61-are 62-designed 63-to 64-be 65-included 66-into 67-3rd 68-party 69-applications ' + '70-while 71-all 72-other 73-files 74-in 75-programs 76-tests 77-or 78-examples ' + '79-receive 80-more 81-limited 82-attention 83-and 84-support 85-for 86-such 87-scenario' + ) + list(qry1.tokens_by_line()) + assert qry1.unknowns_by_pos == {} + + # run matching + matches = idx.match(location=location) + match = matches[0] + + rule = [ + r for r in idx.rules_by_rid + if r.identifier == 'bsd-simplified_and_gpl-2.0_1.RULE' + ][0] + + expected = LicenseMatch( + matcher='2-aho', + rule=rule, + qspan=Span(0, 48), + ispan=Span(0, 48), + ) + + assert match == expected + + # check that query unknown by pos is the same and empty + qry2 = match.query + + # this was incorrectly returned as {15: 0, 20: 0, 21: 0, 41: 0, 43: 0} + # after querying done during matching + assert qry2.unknowns_by_pos == {} diff --git a/tests/licensedcode/test_tokenize.py b/tests/licensedcode/test_tokenize.py index 3b11e942bc4..a84842ba639 100644 --- a/tests/licensedcode/test_tokenize.py +++ b/tests/licensedcode/test_tokenize.py @@ -16,6 +16,7 @@ from commoncode.testcase import FileBasedTesting from licensedcode.tokenize import index_tokenizer +from licensedcode.tokenize import key_phrase_tokenizer from licensedcode.tokenize import matched_query_text_tokenizer from licensedcode.tokenize import query_lines from licensedcode.tokenize import query_tokenizer @@ -394,6 
+395,102 @@ def test_index_tokenizer_lines_on_html_like_texts_2(self, regen=False): result = [list(index_tokenizer(line)) for _ln, line in lines] check_results(result, expected_file, regen=regen) + def test_key_phrase_tokenizer_on_html_like_texts(self, regen=False): + test_file = self.get_test_loc('tokenize/htmlish.txt') + expected_file = test_file + '.expected.key_phrase_tokenizer.json' + lines = query_lines(test_file) + result = [list(key_phrase_tokenizer(line)) for _ln, line in lines] + check_results(result, expected_file, regen=regen) + + def test_key_phrase_tokenizer_lines_on_html_like_texts_2(self, regen=False): + test_file = self.get_test_loc('tokenize/htmlish.html') + expected_file = test_file + '.expected.key_phrase_tokenizer.json' + lines = query_lines(test_file) + result = [list(key_phrase_tokenizer(line)) for _ln, line in lines] + check_results(result, expected_file, regen=regen) + + def test_key_phrase_tokenizer_handles_empty_string(self): + text = '' + result = list(key_phrase_tokenizer(text)) + assert result == [] + + def test_key_phrase_tokenizer_handles_blank_lines(self): + text = u' \n\n\t ' + result = list(key_phrase_tokenizer(text)) + assert result == [] + + def test_key_phrase_tokenizer_handles_blank_lines2(self): + text = ' \n\t ' + result = list(key_phrase_tokenizer(text)) + assert result == [] + + def test_key_phrase_tokenizer_handles_empty_lines(self): + text = u'\n\n' + expected = [] + assert list(key_phrase_tokenizer(text)) == expected + + def test_key_phrase_tokenizer_does_not_crash_on_unicode_rules_text_1(self): + test_file = self.get_test_loc('tokenize/unicode/12290.txt') + with io.open(test_file, encoding='utf-8') as test: + list(key_phrase_tokenizer(test.read())) + + def test_key_phrase_does_not_crash_on_unicode_rules_text_2(self): + test_file = self.get_test_loc('tokenize/unicode/12319.txt') + with io.open(test_file, encoding='utf-8') as test: + list(key_phrase_tokenizer(test.read())) + + def 
test_key_phrase_does_not_crash_on_unicode_rules_text_3(self): + test_file = self.get_test_loc('tokenize/unicode/12405.txt') + with io.open(test_file, encoding='utf-8') as test: + list(key_phrase_tokenizer(test.read())) + + def test_key_phrase_does_not_crash_on_unicode_rules_text_4(self): + test_file = self.get_test_loc('tokenize/unicode/12407.txt') + with io.open(test_file, encoding='utf-8') as test: + list(key_phrase_tokenizer(test.read())) + + def test_key_phrase_does_not_crash_on_unicode_rules_text_5(self): + test_file = self.get_test_loc('tokenize/unicode/12420.txt') + with io.open(test_file, encoding='utf-8') as test: + list(key_phrase_tokenizer(test.read())) + + def test_key_phrase_tokenizer_returns_same_word_tokens_as_index_tokenizer(self): + """ + It is important that the `key_phrase_tokenizer` returns the same amount of tokens (excluding key_phrase markup) + as the `index_tokenizer` so that they Span positions derived from the tokens line up. + """ + text = 'Redistribution \n\n comma and use in \n\t binary \xe4r till\xe5tet.' + key_phrase_tokens = key_phrase_tokenizer(text) + index_tokens = index_tokenizer(text) + assert list(key_phrase_tokens) == list(index_tokens) + + def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_for_multiple_token_key_phrases(self): + text = 'Redistribution and {{use in binary}} is permitted.' + assert list(key_phrase_tokenizer(text)) == ['redistribution', 'and', '{{', 'use', 'in', 'binary', '}}', 'is', + 'permitted'] + + def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_after_newline(self): + text = '{{IS_RIGHT\nThis program is distributed under GPL\n}}IS_RIGHT' + assert list(key_phrase_tokenizer(text)) == ['{{', 'is', 'right', 'this', 'program', 'is', 'distributed', 'under', 'gpl', '}}', 'is', 'right'] + + def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_when_separated_by_space(self): + text = 'Redistribution {{ is }} permitted.' 
+ assert list(key_phrase_tokenizer(text)) == ['redistribution', '{{', 'is', '}}', 'permitted'] + + def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_for_single_token_key_phrase(self): + text = 'Redistribution {{is}} permitted.' + assert list(key_phrase_tokenizer(text)) == ['redistribution', '{{', 'is', '}}', 'permitted'] + + def test_key_phrase_tokenizer_returns_nested_key_phrase_markup_as_tokens(self): + text = 'Redistribution {{is {{not}} really}} permitted.' + assert list(key_phrase_tokenizer(text)) == ['redistribution', '{{', 'is', '{{', 'not', '}}', 'really', '}}', + 'permitted'] + + def test_key_phrase_tokenizer_ignores_invalid_key_phrase_markup(self): + text = 'Redistribution {{{is not really}}} { {permitted} }, I am {afraid}.' + assert list(key_phrase_tokenizer(text)) == ['redistribution', '{{', 'is', 'not', 'really', '}}', 'permitted', + 'i', 'am', 'afraid'] + class TestNgrams(FileBasedTesting): test_data_dir = TEST_DATA_DIR