# contrib/copyright_check.py

# Copyright (C) 2009, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
""" This is a debian copyright file checker.  Put debian/copyright
    file conforming to http://dep.debian.net/deps/dep5/ and
    this program tells you which copyright holders you missed.

    Limitations:

    * for each section, you must put the full set of copyright holders.
      whilst the file lists are "carried over" i.e. later sections
      override earlier ones (see "remove_files()"), the same trick is
      NOT applied to copyright holders.

    * the qgram algorithm is applied to do some fuzzy string matching.
      it's pretty good, but don't rely on it to be perfect.

    * copyright year matching goes against "199" and "200" not
      "198?", "199?", "200?" and certainly not "201?".  if a name
      happens to have "199" or "200" in it, on a line that happens
      to have the word "copyright" in it, it gets assumed to be
      a copyright holder

    * random sentences tacked onto the end of copyrights in files
      are assumed to be part of the copyright holders' name

    * copyrights are assumed to be in the first 80 lines of the file

    * if the file doesn't _have_ a copyright notice, this program can't
      bloody well find it, can it??
"""

import glob
import sys
import os
from string import strip

# qgram: a way to "roughly" match words.  you're supposed to set splitlen
# to half the length of the average word, but 3 is good enough.
def qgram_set(word, splitlen):
    """Return the set of q-grams (substrings of length ``splitlen``) of word.

    The word is padded on both sides with ``splitlen - 1`` NUL characters
    so that its first and last characters each contribute to ``splitlen``
    different q-grams.

    word     -- the string to split up
    splitlen -- length of every q-gram

    Returns a set of strings, each ``splitlen`` characters long.
    """
    pad = '\0' * (splitlen - 1)
    padded = pad + word + pad
    # a string of length n holds n - splitlen + 1 windows of length
    # splitlen; without the "+ 1" the trailing q-gram is silently dropped.
    return set(padded[idx:idx + splitlen]
               for idx in range(len(padded) - splitlen + 1))

def qgram(word1, word2, splitlen=3):
    """Score how alike two strings are, from 0.0 (nothing shared) to 1.0.

    Both strings are broken into q-grams of length ``splitlen`` with
    qgram_set(); the score is the number of q-grams common to both
    divided by the number of distinct q-grams found in either.
    """
    grams1 = qgram_set(word1, splitlen)
    grams2 = qgram_set(word2, splitlen)
    shared = len(grams1 & grams2)
    total = len(grams1 | grams2)
    return float(shared) / float(total)

def truncate_qgram(word1, word2):
    """Best qgram() score of the shorter word against prefixes of the longer.

    Identical words score 1.0 straight away.  Otherwise the longer word
    is cut down to every length from that of the shorter word up to its
    own full length, and the highest score seen is returned.  This lets
    a short string match a longer one that merely has extra text tacked
    on the end.
    """
    if word1 == word2:
        return 1.0
    # make word1 the shorter of the two
    if len(word1) > len(word2):
        word1, word2 = word2, word1
    # the range always has at least one entry because
    # len(word1) <= len(word2) at this point
    return max(qgram(word1, word2[:end])
               for end in range(len(word1), len(word2) + 1))

# testing, testing... ok, it works.
#print qgram("Copyright (C) 2009, Luke Kenneth Casson Leighton <lkcl@lkcl.net>",
#             "Copyright (C) 2006, Google, Inc.")
#print qgram("Copyright (C) 2009, Luke Kenneth Casson Leighton <lkcl@lkcl.net>",
#            "Copyright (C) 2009, Luke Kenneth Casson Leighton <lkcl@lkcl.net>")
#print qgram("Copyright (C) 2009, Luke Kenneth Casson Leighton <lkcl@lkcl.net>",
#             "Copyright (c) 2008, Luke Kenneth Casson Leighton")
#print qgram("Copyright (C) 2009, Luke Kenneth Casson Leighton <lkcl@lkcl.net>",
#             "Copyright (c) 2008, 2009, Luke Kenneth Casson Leighton ")
#print qgram("Copyright (C) 2009, Luke Kenneth Casson Leighton",
#             "Copyright (c) 2008, 2009, Luke Kenneth Casson Leighton ")

def check_match(word, word_list):
    """Sort every entry of word_list by whether it fuzzily matches word.

    Returns a ``(matches, not_matches)`` tuple of sets.  Both sets hold
    ``(word, candidate)`` pairs; a pair lands in ``matches`` when
    truncate_qgram() scores it above 0.6 and in ``not_matches`` otherwise.
    """
    threshold = 0.6
    pairs = [(word, candidate) for candidate in word_list]
    matches = set(pair for pair in pairs
                  if truncate_qgram(pair[0], pair[1]) > threshold)
    not_matches = set(pairs) - matches
    return matches, not_matches

def sanitise(copyright):
    """Strip boilerplate from a copyright notice so that holders compare well.

    * a leading ':' is dropped and the remainder is whitespace-stripped
    * the first occurrence (case-insensitive) of each of "(c)",
      "some rights reserved" and "all rights reserved" is cut out,
      in that order

    An empty string is returned unchanged.

    copyright -- the text following the word "copyright"

    Returns the cleaned-up string.
    """
    # startswith() rather than copyright[0]: indexing an empty string
    # raises IndexError.
    if copyright.startswith(':'):
        copyright = copyright[1:].strip()
    for phrase in ("(c)", "some rights reserved", "all rights reserved"):
        # search the lower-cased text, but cut from the original so the
        # case of everything that remains is preserved
        pos = copyright.lower().find(phrase)
        if pos >= 0:
            copyright = copyright[:pos] + copyright[pos + len(phrase):]
    return copyright

def find_file_copyright_notices(fname, max_lines=80):
    """Collect the copyright notices found near the top of a file.

    Every one of the first ``max_lines`` lines containing the word
    "copyright" (any case) contributes the text after that word, cleaned
    up with sanitise().  Notices containing neither "199" nor "200" are
    assumed not to carry a year and are dropped.

    fname     -- path of the file to scan
    max_lines -- how many leading lines to look at (default 80)

    Returns a set of notice strings.
    """
    f = open(fname)
    try:
        # only the head of the file is needed, so don't read all of it;
        # readline() returns '' at end-of-file, which is skipped below
        # because it cannot contain "copyright".
        head = [f.readline() for _ in range(max_lines)]
    finally:
        # always release the handle: this is called once per source file
        f.close()

    ret = set()
    for line in head:
        idx = line.lower().find("copyright")
        if idx < 0:
            continue
        notice = line[idx + len("copyright"):].strip()
        if not notice:
            continue
        notice = sanitise(notice)
        # quick check for a year: if there isn't one, skip the line
        if "200" not in notice and "199" not in notice:
            continue
        ret.add(notice)
    return ret

def skip_file(fname):
    """Return True when fname names something that must not be scanned.

    Skipped are: names starting with ".svn", ".git" or ".sw"; names
    ending in ".pyc" or ".java"; the name "output"; and anything with
    "PureMVC_Python_1_0" in it.
    """
    for prefix in (".svn", ".git", ".sw"):
        if fname.startswith(prefix):
            return True
    for suffix in (".pyc", ".java"):
        if fname.endswith(suffix):
            return True
    return fname == "output" or "PureMVC_Python_1_0" in fname

def get_files(d):
    """Return the paths of all files below directory d, recursively.

    Entries whose base name is rejected by skip_file() and symbolic
    links are left out.  Sub-directories are descended into through
    get_dir().
    """
    found = []
    for path in glob.glob(os.path.join(d, "*")):
        if not path:
            continue
        name = os.path.split(path)[1]
        if skip_file(name) or os.path.islink(path):
            continue
        if os.path.isdir(path):
            found.extend(get_dir(path))
        else:
            found.append(path)
    return found

def get_dir(match):
    """Expand a glob pattern into a flat list of file paths.

    Every path matched by ``match`` is added to the result; a matched
    directory is replaced by everything get_files() finds inside it.
    Symbolic links and anything rejected by skip_file() are left out.

    match -- a glob pattern, e.g. "*" or "examples/*.py"

    Returns a list of path strings.
    """
    data_files = []
    for d in glob.glob(match):
        # test the base name as well as the full path: get_files() judges
        # entries by base name, and a pattern such as "examples/output"
        # would otherwise slip past the "output" and dot-prefix rules.
        if skip_file(d) or skip_file(os.path.basename(d)):
            continue
        if os.path.islink(d):
            continue
        if os.path.isdir(d):
            data_files += get_files(d)
        else:
            data_files.append(d)
    return data_files

class DebSect:
    """One "Files:" section of the copyright file being checked.

    file_pattern      -- the patterns from the "Files:" line (for display)
    files             -- list of file paths belonging to this section
    copyrights        -- holders declared for the section (set by caller)
    licenses          -- licenses declared for the section (set by caller)
    listed_copyrights -- notices actually found inside the files
    files_by_author   -- maps each found notice to the set of files it
                         was found in
    matches           -- (notice, notice) pairs of found notices that
                         resemble one another
    not_matches       -- found notices that no declared holder resembles
    """

    def __init__(self, pattern, files):
        self.file_pattern = pattern
        self.files = files
        self.copyrights = set()
        self.licenses = set()
        self.listed_copyrights = set()
        self.files_by_author = {}
        self.matches = set()
        self.not_matches = set()

    def read_files_for_copyrights(self):
        """Scan every file of the section for notices, then print a summary.

        Fills in listed_copyrights and files_by_author.
        """
        for fname in self.files:
            if fname.endswith("copyright_check.py"): # skip this program!
                continue
            if fname == 'copyright': # the file being checked, not a source
                continue
            cops = find_file_copyright_notices(fname)
            self.listed_copyrights.update(cops)
            for c in cops:
                self.files_by_author.setdefault(c, set()).add(fname)
        # a single parenthesised argument prints identically whether
        # print is the python 2 statement or a function.
        print("Pattern %s" % (self.file_pattern,))
        for author in self.copyrights:
            print("Copyright: %s" % (author,))
        for author in self.listed_copyrights:
            print("Listed Copyright: %s" % (author,))

    def remove_files(self, to_remove):
        """Drop every path in to_remove from this section's file list.

        The list is modified in place.  All occurrences are removed, so a
        file that was matched by more than one pattern of this section
        does not linger behind.
        """
        doomed = set(to_remove)
        self.files[:] = [fname for fname in self.files
                         if fname not in doomed]

    def check_copyright_matches(self):
        """Report found notices that no declared holder resembles.

        Fills in matches and not_matches, and prints each unmatched
        notice together with the files it was found in.
        """
        self.matches = set()
        self.not_matches = set()

        for author in self.listed_copyrights:
            matches, _ = check_match(author, self.listed_copyrights)
            self.matches.update(matches)
            # every found notice has to be held against the declared
            # holders.  checking only those notices that differ from some
            # other found notice lets a section with a single (or several
            # near-identical) holder pass without ever being looked at.
            declared, _ = check_match(author, self.copyrights)
            if not declared:
                self.not_matches.add(author)

        if self.not_matches:
            print("")
            print("   ** ** ** ** **")
            for m in self.not_matches:
                print("   ** not matches: %s" % (m,))
                for fname in self.files_by_author[m]:
                    print("   ** ** ** ** **: %s" % (fname,))
        print("")

all_files = get_dir("*")
copyright_sects = []
all_listed_files = []

# read debian/copyright file and collect all matched files,
# copyrights and licenses
current_debsect = None
current_copyrights = set()
current_licenses = set()

dc = open("copyright")
try:
    copyright_lines = dc.readlines()
finally:
    # close the handle even if reading fails
    dc.close()

for l in copyright_lines:
    if l.startswith("License:"):
        current_licenses.add(l[len("License:"):].strip())
        continue
    if l.startswith("Copyright:"):
        holder = l[len("Copyright:"):].strip()
        # ignore a "Copyright:" field with nothing on the line: an empty
        # holder is not a holder, and truncate_qgram() scores an empty
        # string as a perfect match against anything.
        if holder:
            current_copyrights.add(sanitise(holder))
        continue
    if not l.startswith("Files:"):
        continue
    # a new "Files:" line closes the previous section: hand it the
    # copyrights and licenses collected since its own "Files:" line.
    if current_debsect:
        current_debsect.licenses = current_licenses
        current_debsect.copyrights = current_copyrights
        current_copyrights = set()
        current_licenses = set()
    # split on any run of whitespace so that doubled or trailing spaces
    # do not produce empty patterns.
    patterns = l[len("Files:"):].split()
    listed_files = []
    for pattern in patterns:
        if pattern.endswith(','):
            pattern = pattern[:-1]
        if not pattern: # the token was a lone comma
            continue
        files = get_dir(pattern)
        listed_files += files
        all_listed_files += files
    current_debsect = DebSect(patterns, listed_files)
    copyright_sects.append(current_debsect)

# the last section is closed by the end of the file
if current_debsect:
    current_debsect.copyrights = current_copyrights
    current_debsect.licenses = current_licenses

# remove already-matching: further down takes precedence
for i in range(1, len(copyright_sects)):
    later = copyright_sects[i]
    for earlier in copyright_sects[:i]:
        earlier.remove_files(later.files)

for sect in copyright_sects:
    sect.read_files_for_copyrights()
    sect.check_copyright_matches()
    # a single parenthesised argument prints identically whether print
    # is the python 2 statement or a function.
    print("")

#def check_in(l1, l2):
#    res = []
#    for fname in l1:
#        if fname not in l2:
#            res.append(fname)
#    return res
#
#not_in = check_in(all_files, listed_files)
#for fname in not_in:
#    print fname
#print listed_files
#print check_in(listed_files, all_files)