Content.py #107

Open

Description

@mfenner1

Hi folks,

I know I should "git with it" and fork, mod, and make pull requests. I haven't done that and I'm SWAMPED with work, but I'm trying to get my fixes (or at least, fixes for my needs) back to you. I hit a corner case in Content.py where the last ContentJob wasn't being flushed out to the database. Running cvsanaly2 with -g (debug) showed the query being made, but nothing was showing up in the DB. So I did some mucking around and (1) rewrote Content.__process_finished_jobs() (basing it on Blame.process_finished_jobs()) and (2) tweaked and cleaned up Content.run(). I may have introduced other regressions in the process.
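
To make the failure mode concrete, here is a toy sketch (ToyJobPool is a hypothetical stand-in; the real JobPool in pycvsanaly2 has a richer API). If the done queue is polled with a zero timeout while the last job is still running, that job is never drained unless the pool is join()ed first and polled once more:

import threading
import time
try:
    import queue            # Python 3
except ImportError:
    import Queue as queue   # Python 2, which cvsanaly targets

class ToyJobPool(object):
    def __init__(self):
        self.done = queue.Queue()
        self.threads = []

    def push(self, job):
        def work():
            time.sleep(0.1)    # simulate fetching file contents
            self.done.put(job)
        t = threading.Thread(target=work)
        t.start()
        self.threads.append(t)

    def get_next_done(self, timeout):
        try:
            return self.done.get(timeout=timeout)
        except queue.Empty:
            return None

    def join(self):
        for t in self.threads:
            t.join()

pool = ToyJobPool()
pool.push("last ContentJob")
print(pool.get_next_done(0))  # None: job still running, nothing to flush
pool.join()                   # wait for stragglers first...
print(pool.get_next_done(0))  # 'last ContentJob': now it can be flushed

That is why the patched run() below always calls job_pool.join() before the final process_finished_jobs() pass.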

Best,
Mark

--- Content.py  2011-07-29 15:58:49.000000000 -0400
+++ /usr/lib/python2.7/site-packages/cvsanaly-2.4-py2.7.egg/pycvsanaly2/extensions/Content.py   2011-07-29 16:43:55.753943196 -0400
@@ -272,38 +272,58 @@

         connection.commit()

-    def __process_finished_jobs(self, job_pool, write_cursor, db):
-#        start = datetime.now()
-        finished_job = job_pool.get_next_done(0)
-        processed_jobs = 0
-        # commit_id is the commit ID. For some reason, the 
-        # documentation advocates tablename_id as the reference,
-        # but in the source, these are referred to as commit IDs.
-        # Don't ask me why!
-        while finished_job is not None:
+    #
+    # MEF 2011-07-29
+    # (seems to be a corner case where the last Content job isn't getting
+    #  flushed out.  trying to fix.) ... It is working for me now.  I did
+    #  two things:
+    #  (1) I rewrote __process_finished_jobs and named it
+    #      process_finished_jobs.  I based its structure on Blame.py,
+    #      which seemed to have a cleaner processing model.
+    #  (2) I modified the job_pool.join() command in the .run(). See below.
+    #
+    def process_finished_jobs(self, job_pool, write_cursor, unlocked=False):
+        #
+        # Build insertion SQL
+        #
+        insertContentQuery = """INSERT INTO content
+                                (commit_id, file_id, content, loc, size) 
+                                VALUES (?,?,?,?,?)"""
+
+        #
+        # get first job
+        #
+        if unlocked:
+            job = job_pool.get_next_done_unlocked()
+        else:
+            job = job_pool.get_next_done(0.5)
+
+        listOfQueryArgs = []
+        processedJobCt = 0
+        while job is not None:
+            # build contents
             file_contents = None
-                        
             if not Config().no_content:
-                file_contents = str(finished_job.file_contents)
+                file_contents = str(job.file_contents)
+            # build full query args for insertContentQuery
+            thisQueryArgs = (job.commit_id, job.file_id,
+                             file_contents,
+                             job.file_number_of_lines,
+                             job.file_size)
+            listOfQueryArgs.append(thisQueryArgs)
+            processedJobCt += 1
+
+            # get next job
+            if unlocked:
+                job = job_pool.get_next_done_unlocked()
+            else:
+                job = job_pool.get_next_done(0)
+        if listOfQueryArgs:
+            write_cursor.executemany(statement(insertContentQuery,
+                                               self.db.place_holder),
+                                     listOfQueryArgs)
+        return processedJobCt

-            query = """
-                insert into content(commit_id, file_id, content, loc, size) 
-                    values(?,?,?,?,?)"""
-            insert_statement = statement(query, db.place_holder)
-            parameters = (finished_job.commit_id,
-                          finished_job.file_id,
-                          file_contents,
-                          finished_job.file_number_of_lines,
-                          finished_job.file_size)
-                                
-            execute_statement(insert_statement, parameters, write_cursor, db,
-                       "Couldn't insert, duplicate record?", 
-                       exception=ExtensionRunError)
-            
-            processed_jobs += 1
-            finished_job = job_pool.get_next_done(0)
-            
-        return processed_jobs

     def run(self, repo, uri, db):
         # Start the profiler, per every other extension
@@ -345,11 +365,13 @@
             raise ExtensionRunError("Couldn't prepare table because " + \
                                     str(e))

-        queuesize = Config().max_threads
-        printdbg("Setting queuesize to " + str(queuesize))
+        maxJobQueueSize = Config().max_threads
+        printdbg("Setting maxJobQueueSize to " + str(maxJobQueueSize))

         # This is where the threading stuff comes in, I expect
-        job_pool = JobPool(repo, path or repo.get_uri(), queuesize=queuesize)
+        job_pool = JobPool(repo,
+                           path or repo.get_uri(),
+                           queuesize=maxJobQueueSize)

         # This filters files if they're not source files.
         # I'm pretty sure "unknown" is returning binary files too, but
@@ -360,6 +382,10 @@
                 "ft.type in('code') and " + \
                 "f.repository_id = ?"
                 # "ft.type in('code', 'unknown') and " + \
+
+        #
+        # find existing content
+        #
         read_cursor.execute(statement(query, db.place_holder), (repo_id,))
         code_files = [item[0] for item in read_cursor.fetchall()]
         query = """select c.file_id, c.commit_id from content c, files f
@@ -369,17 +395,25 @@
         existing_content = [(item[0], item[1]) \
                             for item in read_cursor.fetchall()]

+        #
+        # Get commit x file x action x composed
+        #
         fr = FileRevs(db, connection, read_cursor, repo_id)

-        i = 0
+        currJobQueueSize = 0 
         # Loop through each file and its revision
         for revision, commit_id, file_id, action_type, composed in fr:
-#            loop_start = datetime.now()
+            #
+            # skip non-code files and existing content
+            #
             if file_id not in code_files:
                 continue
             if (file_id, commit_id) in existing_content:
                 continue

+            #
+            # compute revision and proper path
+            #
             try:
                 relative_path = fr.get_path()
             except AttributeError, e:
@@ -398,28 +432,52 @@
                 printdbg("Skipping file %s", (relative_path,))
                 continue

+            #
+            # create a content fetching job
+            #
             job = ContentJob(commit_id, file_id, rev, relative_path)
             job_pool.push(job)
-            i = i + 1
-            if i >= queuesize:
-                printdbg("Content queue is now at %d, flushing to database", 
-                         (i,))
+            currJobQueueSize += 1
+
+            #
+            # flush when the job queue is full
+            # 
+            if currJobQueueSize >= maxJobQueueSize:
+                printdbg("Content job queue is now at %d. Flushing to database", 
+                         (currJobQueueSize,))

-                processed_jobs = self.__process_finished_jobs(job_pool, 
-                                                              write_cursor, db)
+                numProcessedJobs = self.process_finished_jobs(job_pool,
+                                                              write_cursor)
                 connection.commit()
-                i = i - processed_jobs
-                if processed_jobs < (queuesize / 5):
-                    job_pool.join()
+                currJobQueueSize -= numProcessedJobs
+                job_pool.join()

-        job_pool.join()
-        self.__process_finished_jobs(job_pool, write_cursor, db)
+                #
+                # MEF 2011-07-29
+                # Other possible source of the fault: in the old code,
+                # if the number of completed jobs was "too big",
+                # job_pool.join() was never called inside the loop,
+                # so pending jobs only got picked up by the
+                # job_pool.join() outside the loop.
+                #

-        profiler_start("Inserting results in db")
-        #self.__insert_many(write_cursor)
+                #if processed_jobs < (queuesize / 5):
+                #    job_pool.join()
+
+        #
+        # process remaining content jobs
+        #
+        job_pool.join()
+        self.process_finished_jobs(job_pool, write_cursor)                
+
+        #
+        # force commit
+        #
         connection.commit()
-        profiler_stop("Inserting results in db")

+        #
+        # clean up connections and cursors
+        #
         read_cursor.close()
         write_cursor.close()
         connection.close()
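
A note on the other mechanical change: instead of issuing one INSERT per finished job through execute_statement(), the rewritten process_finished_jobs() collects the argument tuples and writes them with a single executemany() call. A minimal standalone sketch of that pattern using sqlite3 (cvsanaly's statement()/db.place_holder helpers adapt the '?' placeholders to whichever backend is configured):

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("""CREATE TABLE content
               (commit_id INTEGER, file_id INTEGER,
                content TEXT, loc INTEGER, size INTEGER)""")

insert_query = """INSERT INTO content
                  (commit_id, file_id, content, loc, size)
                  VALUES (?,?,?,?,?)"""

# One argument tuple per finished job, inserted as a single batch.
rows = [(1, 10, "print 'hello'", 1, 13),
        (1, 11, "x = 1\ny = 2", 2, 11)]
cur.executemany(insert_query, rows)
conn.commit()

print(cur.execute("SELECT COUNT(*) FROM content").fetchone()[0])  # 2

One behavioral difference to keep in mind: the old per-row execute_statement() handled failures per insert (hence its "Couldn't insert, duplicate record?" message), while executemany() aborts the whole batch on the first failing row.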
