From f3f2c780418c5ebf5f2ae409114e480668dfaed8 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Fri, 7 Jun 2024 14:40:43 +0200 Subject: [PATCH] Correctly filter copyrights in licenses #3797 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reference: https://github.com/nexB/scancode-toolkit/issues/3797 Reported-by: Jörg Arndt @Joerki Signed-off-by: Philippe Ombredanne --- src/cluecode/plugin_filter_clues.py | 102 +++-- .../data/licenses/ricebsd.LICENSE | 2 +- ...complex_4_line_statement_in_text-9.txt.yml | 6 +- ...h_lead_copy_sign_and_debian_s_tags.txt.yml | 6 +- .../data/plugin_filter_clues/files/LICENSE4 | 379 ++++++++++++++++++ .../filtered-expected.json | 30 -- .../filtered-expected3.json | 50 +-- .../filtered-expected4.json | 340 ++++++++++++++++ tests/cluecode/test_plugin_filter_clues.py | 16 +- 9 files changed, 810 insertions(+), 121 deletions(-) create mode 100644 tests/cluecode/data/plugin_filter_clues/files/LICENSE4 create mode 100644 tests/cluecode/data/plugin_filter_clues/filtered-expected4.json diff --git a/src/cluecode/plugin_filter_clues.py b/src/cluecode/plugin_filter_clues.py index 987d30c69ec..a00197fc6c8 100644 --- a/src/cluecode/plugin_filter_clues.py +++ b/src/cluecode/plugin_filter_clues.py @@ -7,6 +7,12 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +""" +Filter out or ignore, as in "remove" redundant or irrelevant detected clues such as copyrights, +authors, emails, and urls that are already contained in a matched license text or license rule and +treated as ignorable. 
+""" + from itertools import chain import attr @@ -63,22 +69,24 @@ def process_codebase(self, codebase, **kwargs): if TRACE: logger_debug('RedundantFilter:process_codebase') from licensedcode.cache import get_index + rules_by_id = get_index().rules_by_id for resource in codebase.walk(): - filtered = filter_ignorable_resource_clues(resource, get_index().rules_by_id) + filtered = filter_ignorable_resource_clues(resource=resource, rules_by_id=rules_by_id) if filtered: filtered.save(codebase) def filter_ignorable_resource_clues(resource, rules_by_id): """ - Filter ignorable clues from the `resource` Resource objects using all the - scan details attached to that `resource` and the `rules_by_id` mapping of - {identifier: license Rule object}. Return the `resource` object modified in- - place if it was modified. + Filter ignorable clues from the ``resource`` Resource object using all the + scan details attached to that ``resource`` and the ``rules_by_id`` mapping of + {identifier: license Rule object}. Return the ``resource`` object modified in- + place if it was modified, or None otherwise. 
""" detections = Detections.from_resource(resource) - filtered = filter_ignorable_clues(detections, rules_by_id) + filtered = filter_ignorable_clues(detections=detections, rules_by_id=rules_by_id) + logger_debug(f'filter_ignorable_resource_clues: {filtered}') if filtered: if hasattr(resource, 'emails'): resource.emails = filtered.emails @@ -97,8 +105,7 @@ def filter_ignorable_resource_clues(resource, rules_by_id): class Ignorable(object): # a frozenset of matched line numbers lines_range = attr.ib() - # either a string or a frozenset of strings, such that we can test for `x in - # value` + # either a string or a frozenset of strings, such that we can test for `x in value` value = attr.ib() @@ -119,20 +126,22 @@ class Detections(object): urls = attr.ib(default=attr.Factory(list)) emails = attr.ib(default=attr.Factory(list)) - licenses = attr.ib(default=attr.Factory(list)) + license_matches = attr.ib(default=attr.Factory(list)) # this is the same as author and copyrights, but restructured to be in the # same format as ignorables and is used to filter emails and urls in authors # and copyright - copyrights_as_ignorable = attr.ib(default=attr.Factory(list), repr=False) - holders_as_ignorable = attr.ib(default=attr.Factory(list), repr=False) - authors_as_ignorable = attr.ib(default=attr.Factory(list), repr=False) + copyrights_as_ignorable = attr.ib(default=attr.Factory(list)) + holders_as_ignorable = attr.ib(default=attr.Factory(list)) + authors_as_ignorable = attr.ib(default=attr.Factory(list)) @staticmethod def from_scan_data(data): detected_copyrights = data.get('copyrights', []) detected_authors = data.get('authors', []) detected_holders = data.get('holders', []) + detected_emails = data.get('emails', []) + detected_urls = data.get('urls', []) copyrights_as_ignorable = frozenset( Ignorable( @@ -155,19 +164,23 @@ def from_scan_data(data): for a in detected_authors ) - return Detections( + license_matches = list(chain.from_iterable(d['matches'] for d in 
data['license_detections'])) + + detections = Detections( copyrights=detected_copyrights, - emails=data.get('emails', []), - urls=data.get('urls', []), + emails=detected_emails, + urls=detected_urls, holders=detected_holders, authors=detected_authors, - authors_as_ignorable=authors_as_ignorable, copyrights_as_ignorable=copyrights_as_ignorable, holders_as_ignorable=holders_as_ignorable, + authors_as_ignorable=authors_as_ignorable, - licenses=data.get('licenses', []), + license_matches=license_matches, ) + detections.debug() + return detections @staticmethod def from_resource(resource): @@ -185,11 +198,21 @@ def as_iterable(self): (('url', c) for c in self.urls), ) + def debug(self): + if TRACE: + logger_debug('Detections') + for nv in self.as_iterable(): + logger_debug(' ', nv), + + logger_debug(' copyrights_as_ignorable:', self.copyrights_as_ignorable) + logger_debug(' holders_as_ignorable: ', self.holders_as_ignorable) + logger_debug(' authors_as_ignorable: ', self.authors_as_ignorable) + logger_debug(' license_matches: ', self.license_matches) + def is_empty(clues): if clues: - return not any([ - clues.copyrights, clues.holders, clues.authors, clues.urls, clues.emails]) + return not any([clues.copyrights, clues.holders, clues.authors, clues.urls, clues.emails]) else: # The logic is reversed, so a false or None "clues" object returns None, which # is interpreted as False (i.e., the object is *not* empty). 
@@ -204,18 +227,22 @@ def filter_ignorable_clues(detections, rules_by_id): """ if is_empty(detections): return + if TRACE: + logger_debug('filter_ignorable_clues: detections') + detections.debug() no_detected_ignorables = not detections.copyrights and not detections.authors - ignorables = collect_ignorables(detections.licenses, rules_by_id) - - no_ignorables = not detections.licenses or is_empty(ignorables) + ignorables = collect_ignorables(license_matches=detections.license_matches, rules_by_id=rules_by_id) + no_ignorables = not detections.license_matches or is_empty(ignorables) if TRACE: logger_debug('ignorables', ignorables) # logger_debug('detections', detections) if no_ignorables and no_detected_ignorables: + if TRACE: + logger_debug('filter_ignorable_clues: NO IGNORABLES') return # discard redundant emails if ignorable or in a detections copyright or author @@ -307,9 +334,9 @@ def filter_values(attributes, ignorables, value_key='copyright', strip=''): def collect_ignorables(license_matches, rules_by_id): """ - Collect and return an Ignorables object built from ``license_matches`` - matched licenses list of "licenses" objects returned in ScanCode JSON - results and the ``rules_by_id`` mapping of Rule objects by identifier. + Collect and return an Ignorables object built from ``license_matches`` list of license matches + as returned in ScanCode results license_detection and the ``rules_by_id`` mapping of Rule + objects by rule identifier. The value of each ignorable list of clues is a set of (set of lines number, set of ignorable values). 
@@ -321,6 +348,8 @@ def collect_ignorables(license_matches, rules_by_id): copyrights = set() if not license_matches: + if TRACE: + logger_debug('collect_ignorables: No ignorables!!!!') return Ignorables( copyrights=frozenset(copyrights), holders=frozenset(holders), @@ -328,31 +357,30 @@ def collect_ignorables(license_matches, rules_by_id): urls=frozenset(urls), emails=frozenset(emails), ) - # build tuple of (set of lines number, set of ignorbale values) - for lic in license_matches: + + # build tuple of (set of lines number, set of ignorable values) + for licmat in license_matches: if TRACE: - logger_debug('collect_ignorables: license:', lic['key'], lic['score']) + logger_debug('collect_ignorables: license_match:', licmat['license_expression'], licmat['score']) - matched_rule = lic.get('matched_rule', {}) - rid = matched_rule.get('identifier') - match_coverage = matched_rule.get('match_coverage', 0) + rid = licmat['rule_identifier'] + if not rid: + # we are missing the license match details, we can only skip + if TRACE: logger_debug(' collect_ignorables: skipping, no RID') + continue # ignore poor partial matches # TODO: there must be a better way using coverage + match_coverage = float(licmat['match_coverage']) if match_coverage < 90: if TRACE: logger_debug(' collect_ignorables: skipping, match_coverage under 90%') continue - if not rid: - # we are missing the license match details, we can only skip - if TRACE: logger_debug(' collect_ignorables: skipping, no RID') - continue - rule = rules_by_id[rid] - lines_range = frozenset(range(lic['start_line'], lic['end_line'] + 1)) + lines_range = frozenset(range(licmat['start_line'], licmat['end_line'] + 1)) ign_copyrights = frozenset(rule.ignorable_copyrights or []) if ign_copyrights: diff --git a/src/licensedcode/data/licenses/ricebsd.LICENSE b/src/licensedcode/data/licenses/ricebsd.LICENSE index f0e58c31b1b..ab8564a7361 100644 --- a/src/licensedcode/data/licenses/ricebsd.LICENSE +++ 
b/src/licensedcode/data/licenses/ricebsd.LICENSE @@ -11,7 +11,7 @@ other_urls: - https://github.com/search?q="Also%2C+we+ask+that+use+of+ARPACK+is+properly"&type=code have this ignorable_copyrights: - - (c) 2001, Rice University + - Copyright (c) 2001, Rice University ignorable_holders: - Rice University ignorable_authors: diff --git a/tests/cluecode/data/copyrights/complex_4_line_statement_in_text-9.txt.yml b/tests/cluecode/data/copyrights/complex_4_line_statement_in_text-9.txt.yml index fabf87f1aaa..238642b4db9 100644 --- a/tests/cluecode/data/copyrights/complex_4_line_statement_in_text-9.txt.yml +++ b/tests/cluecode/data/copyrights/complex_4_line_statement_in_text-9.txt.yml @@ -4,13 +4,13 @@ what: - holders_summary copyrights: - Copyright 2002 Jonas Borgstrom 2002 Daniel Lundin - 2002 CodeFactory AB + 2002 CodeFactory AB. - Copyright (c) 1994 The Regents of the University of California holders: - - Jonas Borgstrom Daniel Lundin CodeFactory AB + - Jonas Borgstrom Daniel Lundin CodeFactory AB. - The Regents of the University of California holders_summary: - - value: Jonas Borgstrom Daniel Lundin CodeFactory AB + - value: Jonas Borgstrom Daniel Lundin CodeFactory AB. count: 1 - value: The Regents of the University of California count: 1 diff --git a/tests/cluecode/data/copyrights/copytest/with_lead_copy_sign_and_debian_s_tags.txt.yml b/tests/cluecode/data/copyrights/copytest/with_lead_copy_sign_and_debian_s_tags.txt.yml index f56053e922e..23564906910 100644 --- a/tests/cluecode/data/copyrights/copytest/with_lead_copy_sign_and_debian_s_tags.txt.yml +++ b/tests/cluecode/data/copyrights/copytest/with_lead_copy_sign_and_debian_s_tags.txt.yml @@ -4,12 +4,12 @@ what: - holders_summary copyrights: - Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies) - - (c) 1994-2008 Trolltech ASA + - (c) 1994-2008 Trolltech ASA. holders: - Nokia Corporation and/or its subsidiary(-ies) - - Trolltech ASA + - Trolltech ASA. 
holders_summary: - value: Nokia Corporation and/or its subsidiary(-ies) count: 1 - - value: Trolltech ASA + - value: Trolltech ASA. count: 1 diff --git a/tests/cluecode/data/plugin_filter_clues/files/LICENSE4 b/tests/cluecode/data/plugin_filter_clues/files/LICENSE4 new file mode 100644 index 00000000000..e7c0b65bcdb --- /dev/null +++ b/tests/cluecode/data/plugin_filter_clues/files/LICENSE4 @@ -0,0 +1,379 @@ +This software is copyright (c) 2013 by Mark Jason Dominus . + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. + +Terms of the Perl programming language system itself + +a) the GNU General Public License as published by the Free + Software Foundation; either version 1, or (at your option) any + later version, or +b) the "Artistic License" + +--- The GNU General Public License, Version 1, February 1989 --- + +This software is Copyright (c) 2013 by Mark Jason Dominus . + +This is free software, licensed under: + + The GNU General Public License, Version 1, February 1989 + + GNU GENERAL PUBLIC LICENSE + Version 1, February 1989 + + Copyright (C) 1989 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The license agreements of most software companies try to keep users +at the mercy of those companies. By contrast, our General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. The +General Public License applies to the Free Software Foundation's +software and to any other program whose authors commit to using it. +You can use it for your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Specifically, the General Public License is designed to make +sure that you have the freedom to give away or sell copies of free +software, that you receive source code or can get it if you want it, +that you can change the software or use pieces of it in new free +programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of a such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must tell them their rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any program or other work which +contains a notice placed by the copyright holder saying it may be +distributed under the terms of this General Public License. 
The +"Program", below, refers to any such program or work, and a "work based +on the Program" means either the Program or any work containing the +Program or a portion of it, either verbatim or with modifications. Each +licensee is addressed as "you". + + 1. You may copy and distribute verbatim copies of the Program's source +code as you receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice and +disclaimer of warranty; keep intact all the notices that refer to this +General Public License and to the absence of any warranty; and give any +other recipients of the Program a copy of this General Public License +along with the Program. You may charge a fee for the physical act of +transferring a copy. + + 2. You may modify your copy or copies of the Program or any portion of +it, and copy and distribute such modifications under the terms of Paragraph +1 above, provided that you also do the following: + + a) cause the modified files to carry prominent notices stating that + you changed the files and the date of any change; and + + b) cause the whole of any work that you distribute or publish, that + in whole or in part contains the Program or any part thereof, either + with or without modifications, to be licensed at no charge to all + third parties under the terms of this General Public License (except + that you may choose to grant warranty protection to some or all + third parties, at your option). + + c) If the modified program normally reads commands interactively when + run, you must cause it, when started running for such interactive use + in the simplest and most usual way, to print or display an + announcement including an appropriate copyright notice and a notice + that there is no warranty (or else, saying that you provide a + warranty) and that users may redistribute the program under these + conditions, and telling the user how to view a copy of this General + Public License. 
+ + d) You may charge a fee for the physical act of transferring a + copy, and you may at your option offer warranty protection in + exchange for a fee. + +Mere aggregation of another independent work with the Program (or its +derivative) on a volume of a storage or distribution medium does not bring +the other work under the scope of these terms. + + 3. You may copy and distribute the Program (or a portion or derivative of +it, under Paragraph 2) in object code or executable form under the terms of +Paragraphs 1 and 2 above provided that you also do one of the following: + + a) accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of + Paragraphs 1 and 2 above; or, + + b) accompany it with a written offer, valid for at least three + years, to give any third party free (except for a nominal charge + for the cost of distribution) a complete machine-readable copy of the + corresponding source code, to be distributed under the terms of + Paragraphs 1 and 2 above; or, + + c) accompany it with the information you received as to where the + corresponding source code may be obtained. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form alone.) + +Source code for a work means the preferred form of the work for making +modifications to it. For an executable file, complete source code means +all the source code for all modules it contains; but, as a special +exception, it need not include source code for modules which are standard +libraries that accompany the operating system on which the executable +file runs, or for standard header files or definitions files that +accompany that operating system. + + 4. You may not copy, modify, sublicense, distribute or transfer the +Program except as expressly provided under this General Public License. 
+Any attempt otherwise to copy, modify, sublicense, distribute or transfer +the Program is void, and will automatically terminate your rights to use +the Program under this License. However, parties who have received +copies, or rights to use copies, from you under this General Public +License will not have their licenses terminated so long as such parties +remain in full compliance. + + 5. By copying, distributing or modifying the Program (or any work based +on the Program) you indicate your acceptance of this license to do so, +and all its terms and conditions. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the original +licensor to copy, distribute or modify the Program subject to these +terms and conditions. You may not impose any further restrictions on the +recipients' exercise of the rights granted herein. + + 7. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of the license which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +the license, you may choose any version ever published by the Free Software +Foundation. + + 8. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. 
Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 9. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 10. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to humanity, the best way to achieve this is to make it +free software which everyone can redistribute and change under these +terms. + + To do so, attach the following notices to the program. 
It is safest to +attach them to the start of each source file to most effectively convey +the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 1, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19xx name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the +appropriate parts of the General Public License. Of course, the +commands you use may be called something other than `show w' and `show +c'; they could even be mouse-clicks or menu items--whatever suits your +program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. 
Here a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + program `Gnomovision' (a program to direct compilers to make passes + at assemblers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +That's all there is to it! + + +--- The Artistic License 1.0 --- + +This software is Copyright (c) 2013 by Mark Jason Dominus . + +This is free software, licensed under: + + The Artistic License 1.0 + +The Artistic License + +Preamble + +The intent of this document is to state the conditions under which a Package +may be copied, such that the Copyright Holder maintains some semblance of +artistic control over the development of the package, while giving the users of +the package the right to use and distribute the Package in a more-or-less +customary fashion, plus the right to make reasonable modifications. + +Definitions: + + - "Package" refers to the collection of files distributed by the Copyright + Holder, and derivatives of that collection of files created through + textual modification. + - "Standard Version" refers to such a Package if it has not been modified, + or has been modified in accordance with the wishes of the Copyright + Holder. + - "Copyright Holder" is whoever is named in the copyright or copyrights for + the package. + - "You" is you, if you're thinking about copying or distributing this Package. + - "Reasonable copying fee" is whatever you can justify on the basis of media + cost, duplication charges, time of people involved, and so on. (You will + not be required to justify it to the Copyright Holder, but only to the + computing community at large as a market that must bear the fee.) + - "Freely Available" means that no fee is charged for the item itself, though + there may be fees involved in handling the item. It also means that + recipients of the item may redistribute it under the same conditions they + received it. + +1. 
You may make and give away verbatim copies of the source form of the +Standard Version of this Package without restriction, provided that you +duplicate all of the original copyright notices and associated disclaimers. + +2. You may apply bug fixes, portability fixes and other modifications derived +from the Public Domain or from the Copyright Holder. A Package modified in such +a way shall still be considered the Standard Version. + +3. You may otherwise modify your copy of this Package in any way, provided that +you insert a prominent notice in each changed file stating how and when you +changed that file, and provided that you do at least ONE of the following: + + a) place your modifications in the Public Domain or otherwise make them + Freely Available, such as by posting said modifications to Usenet or an + equivalent medium, or placing the modifications on a major archive site + such as ftp.uu.net, or by allowing the Copyright Holder to include your + modifications in the Standard Version of the Package. + + b) use the modified Package only within your corporation or organization. + + c) rename any non-standard executables so the names do not conflict with + standard executables, which must also be provided, and provide a separate + manual page for each non-standard executable that clearly documents how it + differs from the Standard Version. + + d) make other distribution arrangements with the Copyright Holder. + +4. You may distribute the programs of this Package in object code or executable +form, provided that you do at least ONE of the following: + + a) distribute a Standard Version of the executables and library files, + together with instructions (in the manual page or equivalent) on where to + get the Standard Version. + + b) accompany the distribution with the machine-readable source of the Package + with your modifications. 
+ + c) accompany any non-standard executables with their corresponding Standard + Version executables, giving the non-standard executables non-standard + names, and clearly documenting the differences in manual pages (or + equivalent), together with instructions on where to get the Standard + Version. + + d) make other distribution arrangements with the Copyright Holder. + +5. You may charge a reasonable copying fee for any distribution of this +Package. You may charge any fee you choose for support of this Package. You +may not charge a fee for this Package itself. However, you may distribute this +Package in aggregate with other (possibly commercial) programs as part of a +larger (possibly commercial) software distribution provided that you do not +advertise this Package as a product of your own. + +6. The scripts and library files supplied as input to or produced as output +from the programs of this Package do not automatically fall under the copyright +of this Package, but belong to whomever generated them, and may be sold +commercially, and may be aggregated with this Package. + +7. C or perl subroutines supplied by you and linked into this Package shall not +be considered part of this Package. + +8. The name of the Copyright Holder may not be used to endorse or promote +products derived from this software without specific prior written permission. + +9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+ +The End + diff --git a/tests/cluecode/data/plugin_filter_clues/filtered-expected.json b/tests/cluecode/data/plugin_filter_clues/filtered-expected.json index 4bfec66a9e4..e3193ec5301 100644 --- a/tests/cluecode/data/plugin_filter_clues/filtered-expected.json +++ b/tests/cluecode/data/plugin_filter_clues/filtered-expected.json @@ -76,20 +76,10 @@ "start_line": 2, "end_line": 3 }, - { - "copyright": "Copyright (c) The Apache Software Foundation", - "start_line": 14, - "end_line": 14 - }, { "copyright": "Copyright (c) The Eclipse Foundation https://eclipse.org", "start_line": 41, "end_line": 42 - }, - { - "copyright": "copyright (c) 1999, International Business Machines, Inc., http://www.ibm.com", - "start_line": 67, - "end_line": 68 } ], "holders": [ @@ -98,20 +88,10 @@ "start_line": 2, "end_line": 2 }, - { - "holder": "The Apache Software Foundation", - "start_line": 14, - "end_line": 14 - }, { "holder": "The Eclipse Foundation", "start_line": 41, "end_line": 41 - }, - { - "holder": "International Business Machines, Inc.", - "start_line": 67, - "end_line": 68 } ], "authors": [ @@ -120,11 +100,6 @@ "start_line": 5, "end_line": 5 }, - { - "author": "the Apache Software Foundation (http://www.apache.org/)", - "start_line": 31, - "end_line": 32 - }, { "author": "John Doe", "start_line": 44, @@ -136,11 +111,6 @@ "email": "foo@eclipse.org", "start_line": 4, "end_line": 4 - }, - { - "email": "apache@apache.org", - "start_line": 39, - "end_line": 39 } ], "urls": [], diff --git a/tests/cluecode/data/plugin_filter_clues/filtered-expected3.json b/tests/cluecode/data/plugin_filter_clues/filtered-expected3.json index 4b3c1211551..45102df52ad 100644 --- a/tests/cluecode/data/plugin_filter_clues/filtered-expected3.json +++ b/tests/cluecode/data/plugin_filter_clues/filtered-expected3.json @@ -70,51 +70,11 @@ ], "license_clues": [], "percentage_of_license_text": 100.0, - "copyrights": [ - { - "copyright": "Copyright (c) 1997-2001 University of Cambridge", - "start_line": 11, - 
"end_line": 11 - }, - { - "copyright": "copyright by the University of Cambridge, England", - "start_line": 26, - "end_line": 27 - } - ], - "holders": [ - { - "holder": "University of Cambridge", - "start_line": 11, - "end_line": 11 - }, - { - "holder": "the University of Cambridge, England", - "start_line": 26, - "end_line": 27 - } - ], - "authors": [ - { - "author": "Philip Hazel", - "start_line": 26, - "end_line": 26 - } - ], - "emails": [ - { - "email": "ph10@cam.ac.uk", - "start_line": 8, - "end_line": 8 - } - ], - "urls": [ - { - "url": "ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/", - "start_line": 33, - "end_line": 33 - } - ], + "copyrights": [], + "holders": [], + "authors": [], + "emails": [], + "urls": [], "files_count": 0, "dirs_count": 0, "size_count": 0, diff --git a/tests/cluecode/data/plugin_filter_clues/filtered-expected4.json b/tests/cluecode/data/plugin_filter_clues/filtered-expected4.json new file mode 100644 index 00000000000..cf493480449 --- /dev/null +++ b/tests/cluecode/data/plugin_filter_clues/filtered-expected4.json @@ -0,0 +1,340 @@ +{ + "license_detections": [ + { + "identifier": "gpl_1_0_plus_or_artistic_1_0__and_gpl_1_0_and_artistic_1_0_and_warranty_disclaimer-a1de207d-6b62-18b8-f372-e9820f733de3", + "license_expression": "(gpl-1.0-plus OR artistic-1.0) AND gpl-1.0 AND artistic-1.0 AND warranty-disclaimer", + "license_expression_spdx": "(GPL-1.0-or-later OR Artistic-1.0) AND GPL-1.0-only AND Artistic-1.0 AND LicenseRef-scancode-warranty-disclaimer", + "detection_count": 1, + "reference_matches": [ + { + "license_expression": "gpl-1.0-plus OR artistic-1.0", + "license_expression_spdx": "GPL-1.0-or-later OR Artistic-1.0", + "from_file": "LICENSE4", + "start_line": 3, + "end_line": 11, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 59, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "gpl-1.0-plus_or_artistic-1.0_2.RULE", + "rule_url": 
"https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/gpl-1.0-plus_or_artistic-1.0_2.RULE" + }, + { + "license_expression": "gpl-1.0", + "license_expression_spdx": "GPL-1.0-only", + "from_file": "LICENSE4", + "start_line": 13, + "end_line": 13, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 9, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "gpl-1.0_10.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/gpl-1.0_10.RULE" + }, + { + "license_expression": "gpl-1.0", + "license_expression_spdx": "GPL-1.0-only", + "from_file": "LICENSE4", + "start_line": 17, + "end_line": 19, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 15, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "gpl-1.0_37.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/gpl-1.0_37.RULE" + }, + { + "license_expression": "gpl-1.0", + "license_expression_spdx": "GPL-1.0-only", + "from_file": "LICENSE4", + "start_line": 21, + "end_line": 270, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 2039, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "gpl-1.0.LICENSE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/gpl-1.0.LICENSE" + }, + { + "license_expression": "artistic-1.0", + "license_expression_spdx": "Artistic-1.0", + "from_file": "LICENSE4", + "start_line": 273, + "end_line": 273, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 5, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "artistic-1.0_9.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/artistic-1.0_9.RULE" + }, + { + "license_expression": "artistic-1.0", + "license_expression_spdx": "Artistic-1.0", + "from_file": "LICENSE4", + "start_line": 277, + "end_line": 279, + 
"matcher": "2-aho", + "score": 100.0, + "matched_length": 11, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "artistic-1.0_7.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/artistic-1.0_7.RULE" + }, + { + "license_expression": "artistic-1.0", + "license_expression_spdx": "Artistic-1.0", + "from_file": "LICENSE4", + "start_line": 281, + "end_line": 281, + "matcher": "2-aho", + "score": 90.0, + "matched_length": 3, + "match_coverage": 100.0, + "rule_relevance": 90, + "rule_identifier": "artistic-1.0_11.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/artistic-1.0_11.RULE" + }, + { + "license_expression": "artistic-1.0", + "license_expression_spdx": "Artistic-1.0", + "from_file": "LICENSE4", + "start_line": 283, + "end_line": 372, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 729, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "artistic-1.0_4.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/artistic-1.0_4.RULE" + }, + { + "license_expression": "warranty-disclaimer", + "license_expression_spdx": "LicenseRef-scancode-warranty-disclaimer", + "from_file": "LICENSE4", + "start_line": 374, + "end_line": 376, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 26, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "warranty-disclaimer_72.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/warranty-disclaimer_72.RULE" + } + ] + } + ], + "files": [ + { + "path": "LICENSE4", + "type": "file", + "name": "LICENSE4", + "base_name": "LICENSE4", + "extension": "", + "size": 18407, + "sha1": "8a2e40b4eb23cc05a0b78330d919a7ffacde7a9a", + "md5": "d5d03e14130735213e0532277df33cab", + "sha256": "e4bd79e88b577d66d351597c0d4114b9ee2e31f0544795e87a88ca1b0dd38383", + "mime_type": 
"text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "detected_license_expression": "(gpl-1.0-plus OR artistic-1.0) AND gpl-1.0 AND artistic-1.0 AND warranty-disclaimer", + "detected_license_expression_spdx": "(GPL-1.0-or-later OR Artistic-1.0) AND GPL-1.0-only AND Artistic-1.0 AND LicenseRef-scancode-warranty-disclaimer", + "license_detections": [ + { + "license_expression": "(gpl-1.0-plus OR artistic-1.0) AND gpl-1.0 AND artistic-1.0 AND warranty-disclaimer", + "license_expression_spdx": "(GPL-1.0-or-later OR Artistic-1.0) AND GPL-1.0-only AND Artistic-1.0 AND LicenseRef-scancode-warranty-disclaimer", + "matches": [ + { + "license_expression": "gpl-1.0-plus OR artistic-1.0", + "spdx_license_expression": "GPL-1.0-or-later OR Artistic-1.0", + "from_file": "LICENSE4", + "start_line": 3, + "end_line": 11, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 59, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "gpl-1.0-plus_or_artistic-1.0_2.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/gpl-1.0-plus_or_artistic-1.0_2.RULE" + }, + { + "license_expression": "gpl-1.0", + "spdx_license_expression": "GPL-1.0-only", + "from_file": "LICENSE4", + "start_line": 13, + "end_line": 13, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 9, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "gpl-1.0_10.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/gpl-1.0_10.RULE" + }, + { + "license_expression": "gpl-1.0", + "spdx_license_expression": "GPL-1.0-only", + "from_file": "LICENSE4", + "start_line": 17, + "end_line": 19, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 15, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": 
"gpl-1.0_37.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/gpl-1.0_37.RULE" + }, + { + "license_expression": "gpl-1.0", + "spdx_license_expression": "GPL-1.0-only", + "from_file": "LICENSE4", + "start_line": 21, + "end_line": 270, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 2039, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "gpl-1.0.LICENSE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/gpl-1.0.LICENSE" + }, + { + "license_expression": "artistic-1.0", + "spdx_license_expression": "Artistic-1.0", + "from_file": "LICENSE4", + "start_line": 273, + "end_line": 273, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 5, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "artistic-1.0_9.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/artistic-1.0_9.RULE" + }, + { + "license_expression": "artistic-1.0", + "spdx_license_expression": "Artistic-1.0", + "from_file": "LICENSE4", + "start_line": 277, + "end_line": 279, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 11, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "artistic-1.0_7.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/artistic-1.0_7.RULE" + }, + { + "license_expression": "artistic-1.0", + "spdx_license_expression": "Artistic-1.0", + "from_file": "LICENSE4", + "start_line": 281, + "end_line": 281, + "matcher": "2-aho", + "score": 90.0, + "matched_length": 3, + "match_coverage": 100.0, + "rule_relevance": 90, + "rule_identifier": "artistic-1.0_11.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/artistic-1.0_11.RULE" + }, + { + "license_expression": "artistic-1.0", + "spdx_license_expression": "Artistic-1.0", + "from_file": "LICENSE4", 
+ "start_line": 283, + "end_line": 372, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 729, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "artistic-1.0_4.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/artistic-1.0_4.RULE" + }, + { + "license_expression": "warranty-disclaimer", + "spdx_license_expression": "LicenseRef-scancode-warranty-disclaimer", + "from_file": "LICENSE4", + "start_line": 374, + "end_line": 376, + "matcher": "2-aho", + "score": 100.0, + "matched_length": 26, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "warranty-disclaimer_72.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/warranty-disclaimer_72.RULE" + } + ], + "identifier": "gpl_1_0_plus_or_artistic_1_0__and_gpl_1_0_and_artistic_1_0_and_warranty_disclaimer-a1de207d-6b62-18b8-f372-e9820f733de3" + } + ], + "license_clues": [], + "percentage_of_license_text": 98.57, + "copyrights": [ + { + "copyright": "copyright (c) 2013 by Mark Jason Dominus ", + "start_line": 1, + "end_line": 1 + }, + { + "copyright": "Copyright (c) 2013 by Mark Jason Dominus ", + "start_line": 15, + "end_line": 15 + }, + { + "copyright": "Copyright (c) 2013 by Mark Jason Dominus ", + "start_line": 275, + "end_line": 275 + } + ], + "holders": [ + { + "holder": "Mark Jason Dominus", + "start_line": 1, + "end_line": 1 + }, + { + "holder": "Mark Jason Dominus", + "start_line": 15, + "end_line": 15 + }, + { + "holder": "Mark Jason Dominus", + "start_line": 275, + "end_line": 275 + } + ], + "authors": [], + "emails": [], + "urls": [], + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/cluecode/test_plugin_filter_clues.py b/tests/cluecode/test_plugin_filter_clues.py index 77c85c6499a..9e5fb950d92 100644 --- a/tests/cluecode/test_plugin_filter_clues.py +++ 
b/tests/cluecode/test_plugin_filter_clues.py @@ -18,7 +18,6 @@ from scancode_config import REGEN_TEST_FIXTURES - test_env = FileDrivenTesting() test_env.test_data_dir = os.path.join(os.path.dirname(__file__), 'data') @@ -34,7 +33,7 @@ def test_is_empty_(): def test_scan_plugin_filter_clues_for_rule(): - # this test fies is a copy of apache-1.1_63.RULE that contains + # this test file is a copy of apache-1.1_63.RULE that contains # several emails, authors, urls and copyrights # it has been modified to include more unrelated clues test_dir = test_env.get_test_loc('plugin_filter_clues/files/LICENSE') @@ -67,3 +66,16 @@ def test_scan_plugin_filter_clues_for_license(): run_scan_click(args) expected = test_env.get_test_loc('plugin_filter_clues/filtered-expected3.json') check_json_scan(expected, result_file, remove_file_date=True, regen=REGEN_TEST_FIXTURES) + + +# Regression on data structure tracked in https://github.com/nexB/scancode-toolkit/issues/3797 +def test_scan_plugin_filter_copyrights_for_license(): + # this test file holds GPL 1.0 and Artistic 1.0 license texts with copyright + # statements outside of the matched license texts that must not be filtered + test_dir = test_env.get_test_loc('plugin_filter_clues/files/LICENSE4') + result_file = test_env.get_temp_file('json') + args = ['-clieu', '--filter-clues', test_dir, '--json', result_file] + run_scan_click(args) + expected = test_env.get_test_loc('plugin_filter_clues/filtered-expected4.json', must_exist=False) + check_json_scan(expected, result_file, remove_file_date=True, regen=REGEN_TEST_FIXTURES) +