Content.py #107

Open

Description

@mfenner1

Hi folks,

I know I should "git with it" and fork, mod, and make pull requests. I haven't done that and I'm SWAMPED with work, but I'm trying to get my fixes (or at least, fixes for my needs) back to you. I hit a corner case in Content.py where the last ContentJob wasn't being flushed out to the database. Running cvsanaly2 with -g (debug) showed the query being made, but nothing was showing up in the DB. So I did some mucking around and (1) rewrote Content.__process_finished_jobs() (basing it on Blame.process_finished_jobs()) and (2) tweaked and cleaned up Content.run(). I may have introduced other regressions in the process.
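
To make the failure mode concrete, here is a toy sketch (ToyJobPool is a hypothetical stand-in; the real JobPool in pycvsanaly2 has a richer API). If the done queue is polled with a zero timeout while the last job is still running, that job is never drained unless the pool is join()ed first and polled once more:

import threading
import time
try:
    import queue            # Python 3
except ImportError:
    import Queue as queue   # Python 2, which cvsanaly targets

class ToyJobPool(object):
    def __init__(self):
        self.done = queue.Queue()
        self.threads = []

    def push(self, job):
        def work():
            time.sleep(0.1)    # simulate fetching file contents
            self.done.put(job)
        t = threading.Thread(target=work)
        t.start()
        self.threads.append(t)

    def get_next_done(self, timeout):
        try:
            return self.done.get(timeout=timeout)
        except queue.Empty:
            return None

    def join(self):
        for t in self.threads:
            t.join()

pool = ToyJobPool()
pool.push("last ContentJob")
print(pool.get_next_done(0))  # None: job still running, nothing to flush
pool.join()                   # wait for stragglers first...
print(pool.get_next_done(0))  # 'last ContentJob': now it can be flushed

That is why the patched run() below always calls job_pool.join() before the final process_finished_jobs() pass.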

Best,
Mark

--- Content.py  2011-07-29 15:58:49.000000000 -0400
+++ /usr/lib/python2.7/site-packages/cvsanaly-2.4-py2.7.egg/pycvsanaly2/extensions/Content.py   2011-07-29 16:43:55.753943196 -0400
@@ -272,38 +272,58 @@

         connection.commit()

-    def __process_finished_jobs(self, job_pool, write_cursor, db):
-#        start = datetime.now()
-        finished_job = job_pool.get_next_done(0)
-        processed_jobs = 0
-        # commit_id is the commit ID. For some reason, the 
-        # documentation advocates tablename_id as the reference,
-        # but in the source, these are referred to as commit IDs.
-        # Don't ask me why!
-        while finished_job is not None:
+    #
+    # MEF 2011-07-29
+    # (seems to be a corner case where the last Content job isn't getting
+    #  flushed out.  trying to fix.) ... It is working for me now.  I did
+    #  two things:
+    #  (1) I rewrote __process_finished_jobs and named it
+    #      process_finished_jobs.  I based its structure on Blame.py,
+    #      which seemed to have a cleaner processing model.
+    #  (2) I modified the job_pool.join() command in the .run(). See below.
+    #
+    def process_finished_jobs(self, job_pool, write_cursor, unlocked=False):
+        #
+        # Build insertion SQL
+        #
+        insertContentQuery = """INSERT INTO content
+                                (commit_id, file_id, content, loc, size) 
+                                VALUES (?,?,?,?,?)"""
+
+        #
+        # get first job
+        #
+        if unlocked:
+            job = job_pool.get_next_done_unlocked()
+        else:
+            job = job_pool.get_next_done(0.5)
+
+        listOfQueryArgs = []
+        processedJobCt = 0
+        while job is not None:
+            # build contents
             file_contents = None
-                        
             if not Config().no_content:
-                file_contents = str(finished_job.file_contents)
+                file_contents = str(job.file_contents)
+            # build full query args for insertContentQuery
+            thisQueryArgs = (job.commit_id, job.file_id,
+                             file_contents,
+                             job.file_number_of_lines,
+                             job.file_size)
+            listOfQueryArgs.append(thisQueryArgs)
+            processedJobCt += 1
+
+            # get next job
+            if unlocked:
+                job = job_pool.get_next_done_unlocked()
+            else:
+                job = job_pool.get_next_done(0)
+        if listOfQueryArgs:
+            write_cursor.executemany(statement(insertContentQuery,
+                                               self.db.place_holder),
+                                     listOfQueryArgs)
+        return processedJobCt

-            query = """
-                insert into content(commit_id, file_id, content, loc, size) 
-                    values(?,?,?,?,?)"""
-            insert_statement = statement(query, db.place_holder)
-            parameters = (finished_job.commit_id,
-                          finished_job.file_id,
-                          file_contents,
-                          finished_job.file_number_of_lines,
-                          finished_job.file_size)
-                                
-            execute_statement(insert_statement, parameters, write_cursor, db,
-                       "Couldn't insert, duplicate record?", 
-                       exception=ExtensionRunError)
-            
-            processed_jobs += 1
-            finished_job = job_pool.get_next_done(0)
-            
-        return processed_jobs

     def run(self, repo, uri, db):
         # Start the profiler, per every other extension
@@ -345,11 +365,13 @@
             raise ExtensionRunError("Couldn't prepare table because " + \
                                     str(e))

-        queuesize = Config().max_threads
-        printdbg("Setting queuesize to " + str(queuesize))
+        maxJobQueueSize = Config().max_threads
+        printdbg("Setting maxJobQueueSize to " + str(maxJobQueueSize))

         # This is where the threading stuff comes in, I expect
-        job_pool = JobPool(repo, path or repo.get_uri(), queuesize=queuesize)
+        job_pool = JobPool(repo,
+                           path or repo.get_uri(),
+                           queuesize=maxJobQueueSize)

         # This filters files if they're not source files.
         # I'm pretty sure "unknown" is returning binary files too, but
@@ -360,6 +382,10 @@
                 "ft.type in('code') and " + \
                 "f.repository_id = ?"
                 # "ft.type in('code', 'unknown') and " + \
+
+        #
+        # find existing content
+        #
         read_cursor.execute(statement(query, db.place_holder), (repo_id,))
         code_files = [item[0] for item in read_cursor.fetchall()]
         query = """select c.file_id, c.commit_id from content c, files f
@@ -369,17 +395,25 @@
         existing_content = [(item[0], item[1]) \
                             for item in read_cursor.fetchall()]

+        #
+        # Get commit x file x action x composed
+        #
         fr = FileRevs(db, connection, read_cursor, repo_id)

-        i = 0
+        currJobQueueSize = 0 
         # Loop through each file and its revision
         for revision, commit_id, file_id, action_type, composed in fr:
-#            loop_start = datetime.now()
+            #
+            # skip non-code files and existing content
+            #
             if file_id not in code_files:
                 continue
             if (file_id, commit_id) in existing_content:
                 continue

+            #
+            # compute revision and proper path
+            #
             try:
                 relative_path = fr.get_path()
             except AttributeError, e:
@@ -398,28 +432,52 @@
                 printdbg("Skipping file %s", (relative_path,))
                 continue

+            #
+            # create a content fetching job
+            #
             job = ContentJob(commit_id, file_id, rev, relative_path)
             job_pool.push(job)
-            i = i + 1
-            if i >= queuesize:
-                printdbg("Content queue is now at %d, flushing to database", 
-                         (i,))
+            currJobQueueSize += 1
+
+            #
+            # flush when the job queue is full
+            # 
+            if currJobQueueSize >= maxJobQueueSize:
+                printdbg("Content job queue is now at %d. Flushing to database", 
+                         (currJobQueueSize,))

-                processed_jobs = self.__process_finished_jobs(job_pool, 
-                                                              write_cursor, db)
+                numProcessedJobs = self.process_finished_jobs(job_pool,
+                                                              write_cursor)
                 connection.commit()
-                i = i - processed_jobs
-                if processed_jobs < (queuesize / 5):
-                    job_pool.join()
+                currJobQueueSize -= numProcessedJobs
+                job_pool.join()

-        job_pool.join()
-        self.__process_finished_jobs(job_pool, write_cursor, db)
+                #
+                # MEF 2011-07-29
+                # Other possible source of the fault: in the old code,
+                # if the number of completed jobs was "too big",
+                # job_pool.join() was never called inside the loop,
+                # so pending jobs only got picked up by the
+                # job_pool.join() outside the loop.
+                #

-        profiler_start("Inserting results in db")
-        #self.__insert_many(write_cursor)
+                #if processed_jobs < (queuesize / 5):
+                #    job_pool.join()
+
+        #
+        # process remaining content jobs
+        #
+        job_pool.join()
+        self.process_finished_jobs(job_pool, write_cursor)                
+
+        #
+        # force commit
+        #
         connection.commit()
-        profiler_stop("Inserting results in db")

+        #
+        # clean up connections and cursors
+        #
         read_cursor.close()
         write_cursor.close()
         connection.close()
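
A note on the other mechanical change: instead of issuing one INSERT per finished job through execute_statement(), the rewritten process_finished_jobs() collects the argument tuples and writes them with a single executemany() call. A minimal standalone sketch of that pattern using sqlite3 (cvsanaly's statement()/db.place_holder helpers adapt the '?' placeholders to whichever backend is configured):

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("""CREATE TABLE content
               (commit_id INTEGER, file_id INTEGER,
                content TEXT, loc INTEGER, size INTEGER)""")

insert_query = """INSERT INTO content
                  (commit_id, file_id, content, loc, size)
                  VALUES (?,?,?,?,?)"""

# One argument tuple per finished job, inserted as a single batch.
rows = [(1, 10, "print 'hello'", 1, 13),
        (1, 11, "x = 1\ny = 2", 2, 11)]
cur.executemany(insert_query, rows)
conn.commit()

print(cur.execute("SELECT COUNT(*) FROM content").fetchone()[0])  # 2

One behavioral difference to keep in mind: the old per-row execute_statement() handled failures per insert (hence its "Couldn't insert, duplicate record?" message), while executemany() aborts the whole batch on the first failing row.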
