Description
Hi folks,
I know I should "git with it" and fork, modify, and make pull requests. I haven't done that, and I'm SWAMPED with work, but I'm trying to get my fixes (or at least, fixes for my needs) back to you. I hit a corner case in Content.py where the last ContentJob wasn't being flushed out to the database: running cvsanaly2 with -g (debug) showed the query being made, but nothing showed up in the DB. So I did some mucking around and (1) rewrote Content.__process_finished_jobs() (based on Blame.process_finished_jobs()) and (2) tweaked and cleaned up Content.run(). I don't know whether I introduced other regressions in the process.
Best,
Mark
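P.S. For anyone skimming the patch, here's a minimal, self-contained sketch of the push/flush/join pattern the rewrite moves to. FakeJobPool below is a made-up stand-in, not the real cvsanaly2 JobPool, and the numbers are arbitrary; the point is only that the pool gets joined and drained unconditionally after the loop, so the final job can't be left behind.

class FakeJobPool(object):
    """Made-up stand-in for cvsanaly2's JobPool, for illustration only."""
    def __init__(self):
        self.done = []              # finished jobs waiting to be collected

    def push(self, job):
        # a real pool hands the job to a worker thread; here it
        # "finishes" immediately
        self.done.append(job)

    def join(self):
        pass                        # a real pool waits for its workers here

    def get_next_done(self, timeout=0):
        return self.done.pop(0) if self.done else None


def flush_finished(pool, batch):
    """Drain every finished job into batch; return how many were drained."""
    drained = 0
    job = pool.get_next_done(0)
    while job is not None:
        batch.append(job)
        drained += 1
        job = pool.get_next_done(0)
    return drained


pool = FakeJobPool()
batch = []
queuesize = 4                       # stands in for Config().max_threads
pending = 0

for job_id in range(10):            # pretend these are ContentJobs
    pool.push(job_id)
    pending += 1
    if pending >= queuesize:        # periodic flush while looping
        pending -= flush_finished(pool, batch)
        pool.join()

# The crucial part: after the loop, join unconditionally and flush one
# last time so the final job(s) actually reach the database.
pool.join()
flush_finished(pool, batch)
print(len(batch))                   # 10 -- every job accounted for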
--- Content.py 2011-07-29 15:58:49.000000000 -0400
+++ /usr/lib/python2.7/site-packages/cvsanaly-2.4-py2.7.egg/pycvsanaly2/extensions/Content.py 2011-07-29 16:43:55.753943196 -0400
@@ -272,38 +272,58 @@
connection.commit()
- def __process_finished_jobs(self, job_pool, write_cursor, db):
-# start = datetime.now()
- finished_job = job_pool.get_next_done(0)
- processed_jobs = 0
- # commit_id is the commit ID. For some reason, the
- # documentation advocates tablename_id as the reference,
- # but in the source, these are referred to as commit IDs.
- # Don't ask me why!
- while finished_job is not None:
+ #
+ # MEF 2011-07-29
+ # (There seems to be a corner case where the last Content job isn't
+ # getting flushed out to the database; trying to fix.) It is working
+ # for me now. I did two things:
+ # (1) I rewrote __process_finished_jobs and renamed it
+ #     process_finished_jobs. I based its structure on Blame.py,
+ #     which seemed to have a cleaner processing model.
+ # (2) I modified the job_pool.join() call in run(). See below.
+ #
+ def process_finished_jobs(self, job_pool, write_cursor, unlocked=False):
+ #
+ # Build insertion SQL
+ #
+ insertContentQuery = """INSERT INTO content
+ (commit_id, file_id, content, loc, size)
+ VALUES (?,?,?,?,?)"""
+
+ #
+ # get first job
+ #
+ if unlocked:
+ job = job_pool.get_next_done_unlocked()
+ else:
+ job = job_pool.get_next_done(0.5)
+
+ listOfQueryArgs = []
+ processedJobCt = 0
+ while job is not None:
+ # build contents
file_contents = None
-
if not Config().no_content:
- file_contents = str(finished_job.file_contents)
+ file_contents = str(job.file_contents)
+ # build full query args for insertContentQuery
+ thisQueryArgs = (job.commit_id, job.file_id,
+ file_contents,
+ job.file_number_of_lines,
+ job.file_size)
+ listOfQueryArgs.append(thisQueryArgs)
+ processedJobCt += 1
+
+ # get next job
+ if unlocked:
+ job = job_pool.get_next_done_unlocked()
+ else:
+ job = job_pool.get_next_done(0)
+ if listOfQueryArgs:
+ write_cursor.executemany(statement(insertContentQuery,
+ self.db.place_holder),
+ listOfQueryArgs)
+ return processedJobCt
- query = """
- insert into content(commit_id, file_id, content, loc, size)
- values(?,?,?,?,?)"""
- insert_statement = statement(query, db.place_holder)
- parameters = (finished_job.commit_id,
- finished_job.file_id,
- file_contents,
- finished_job.file_number_of_lines,
- finished_job.file_size)
-
- execute_statement(insert_statement, parameters, write_cursor, db,
- "Couldn't insert, duplicate record?",
- exception=ExtensionRunError)
-
- processed_jobs += 1
- finished_job = job_pool.get_next_done(0)
-
- return processed_jobs
def run(self, repo, uri, db):
# Start the profiler, per every other extension
@@ -345,11 +365,13 @@
raise ExtensionRunError("Couldn't prepare table because " + \
str(e))
- queuesize = Config().max_threads
- printdbg("Setting queuesize to " + str(queuesize))
+ maxJobQueueSize = Config().max_threads
+ printdbg("Setting maxJobQueueSize to " + str(maxJobQueueSize))
# This is where the threading stuff comes in, I expect
- job_pool = JobPool(repo, path or repo.get_uri(), queuesize=queuesize)
+ job_pool = JobPool(repo,
+ path or repo.get_uri(),
+ queuesize=maxJobQueueSize)
# This filters files if they're not source files.
# I'm pretty sure "unknown" is returning binary files too, but
@@ -360,6 +382,10 @@
"ft.type in('code') and " + \
"f.repository_id = ?"
# "ft.type in('code', 'unknown') and " + \
+
+ #
+ # find existing content
+ #
read_cursor.execute(statement(query, db.place_holder), (repo_id,))
code_files = [item[0] for item in read_cursor.fetchall()]
query = """select c.file_id, c.commit_id from content c, files f
@@ -369,17 +395,25 @@
existing_content = [(item[0], item[1]) \
for item in read_cursor.fetchall()]
+ #
+ # Get commit x file x action x composed
+ #
fr = FileRevs(db, connection, read_cursor, repo_id)
- i = 0
+ currJobQueueSize = 0
# Loop through each file and its revision
for revision, commit_id, file_id, action_type, composed in fr:
-# loop_start = datetime.now()
+ #
+ # skip non-code files and existing content
+ #
if file_id not in code_files:
continue
if (file_id, commit_id) in existing_content:
continue
+ #
+ # compute revision and proper path
+ #
try:
relative_path = fr.get_path()
except AttributeError, e:
@@ -398,28 +432,52 @@
printdbg("Skipping file %s", (relative_path,))
continue
+ #
+ # create a content fetching job
+ #
job = ContentJob(commit_id, file_id, rev, relative_path)
job_pool.push(job)
- i = i + 1
- if i >= queuesize:
- printdbg("Content queue is now at %d, flushing to database",
- (i,))
+ currJobQueueSize += 1 # i = i+1
+
+ #
+ # flush the job queue to the database when it fills up
+ #
+ if currJobQueueSize >= maxJobQueueSize:
+ printdbg("Content job queue is now at %d. Flushing to database",
+ (currJobQueueSize,))
- processed_jobs = self.__process_finished_jobs(job_pool,
- write_cursor, db)
+ numProcessedJobs = self.process_finished_jobs(job_pool,
+ write_cursor)
connection.commit()
- i = i - processed_jobs
- if processed_jobs < (queuesize / 5):
- job_pool.join()
+ currJobQueueSize -= numProcessedJobs
+ job_pool.join()
- job_pool.join()
- self.__process_finished_jobs(job_pool, write_cursor, db)
+ #
+ # MEF 2011-07-29
+ # Another possible source of the fault: in the old code, if the
+ # number of completed jobs was "too big" (>= queuesize / 5),
+ # job_pool.join() was never called inside the loop, so the last
+ # job would only get joined at the job_pool.join() outside the
+ # loop.
+ #
- profiler_start("Inserting results in db")
- #self.__insert_many(write_cursor)
+ #if processed_jobs < (queuesize / 5):
+ # job_pool.join()
+
+ #
+ # process remaining content jobs
+ #
+ job_pool.join()
+ self.process_finished_jobs(job_pool, write_cursor)
+
+ #
+ # force commit
+ #
connection.commit()
- profiler_stop("Inserting results in db")
+ #
+ # clean up connections and cursors
+ #
read_cursor.close()
write_cursor.close()
connection.close()
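One more note on the rewritten process_finished_jobs(): batching the parameter tuples and handing them to a single cursor.executemany() is the standard DB-API pattern it relies on. Below is a small sketch of that same shape outside of cvsanaly2, using plain sqlite3 and an invented in-memory table (the column names just mirror the patch):

import sqlite3

# illustration only: in-memory SQLite with a made-up "content" table
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("""CREATE TABLE content
               (commit_id INTEGER, file_id INTEGER,
                content TEXT, loc INTEGER, size INTEGER)""")

insert_query = """INSERT INTO content
                  (commit_id, file_id, content, loc, size)
                  VALUES (?,?,?,?,?)"""

# accumulate one parameter tuple per finished job...
rows = [
    (1, 10, "print('hello')", 1, 15),
    (1, 11, "x = 1",          1, 5),
    (2, 10, "print('bye')",   1, 13),
]

# ...then hand the whole batch to the driver in one call, instead of
# executing the INSERT once per row
cur.executemany(insert_query, rows)
conn.commit()

print(cur.execute("SELECT COUNT(*) FROM content").fetchone()[0])  # 3
conn.close()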