Merge pull request #2898 from chaoss/augur-release-0.76.2
Augur release 0.76.2
sgoggins authored Sep 25, 2024

Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
2 parents fe11b0e + ef39e84 commit c8eba65
Showing 55 changed files with 15,473 additions and 2,391 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ augur_export_env.sh
config.yml
reports.yml
*.pid
+*.sock

node_modules/
.idea/
4 changes: 2 additions & 2 deletions README.md
@@ -1,4 +1,4 @@
-# Augur NEW Release v0.76.1
+# Augur NEW Release v0.76.2

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io
@@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o
## NEW RELEASE ALERT!
### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md)

-Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.1
+Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.2

- The `main` branch is a stable version of our new architecture, which features:
- Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
158 changes: 158 additions & 0 deletions augur/api/metrics/deps.py
@@ -77,4 +77,162 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No
return results


@register_metric()
def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None):
"""
Returns the most recent libyear (dependency freshness) value for each dependency in a repo or repo group.
:param repo_id: The repository's id
:param repo_group_id: The repository's group id
:param period: To set the periodicity to 'day', 'week', 'month' or 'year', defaults to 'day'
:param begin_date: Specifies the begin date, defaults to '1970-1-1 00:00:00'
:param end_date: Specifies the end date, defaults to datetime.now()
:return: DataFrame of dependencies and their libyear values
"""

if not begin_date:
begin_date = '1970-1-1 00:00:01'
if not end_date:
end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

if repo_id:

libyearSQL = s.sql.text("""
SELECT
rg_name,
repo_group_id,
repo_name,
d.repo_id,
repo_git,
forked_from,
repo_archived,
c.name,
c.libyear,
MAX ( C.data_collection_date ) AS most_recent_collection
FROM
(
SELECT A.rg_name AS rg_name,
A.repo_group_id AS repo_group_id,
b.repo_name AS repo_name,
b.repo_id AS repo_id,
b.repo_git AS repo_git,
b.forked_from AS forked_from,
b.repo_archived AS repo_archived
FROM
repo_groups A,
repo b
WHERE
A.repo_group_id = b.repo_group_id
ORDER BY
rg_name,
repo_name
) d,
(
SELECT DISTINCT
f.repo_id,
f.NAME,
f.libyear,
f.data_collection_date
FROM
( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e,
augur_data.repo_deps_libyear f
WHERE
e.data_collection_date = f.data_collection_date and
e.repo_id = f.repo_id
ORDER BY
NAME
) C
WHERE
d.repo_id = C.repo_id
AND C.repo_id = :repo_id
GROUP BY
rg_name,
repo_git,
repo_group_id,
repo_name,
d.repo_id,
forked_from,
repo_archived,
c.name,
c.libyear
ORDER BY
repo_id;
""")

with current_app.engine.connect() as conn:
results = pd.read_sql(libyearSQL, conn, params={'repo_id': repo_id})

else:

libyearSQL = s.sql.text("""
Select w.* from
(
SELECT
rg_name,
repo_group_id,
repo_name,
d.repo_id,
repo_git,
forked_from,
repo_archived,
c.name,
c.libyear,
MAX ( C.data_collection_date ) AS most_recent_collection
FROM
(
SELECT A.rg_name AS rg_name,
A.repo_group_id AS repo_group_id,
b.repo_name AS repo_name,
b.repo_id AS repo_id,
b.repo_git AS repo_git,
b.forked_from AS forked_from,
b.repo_archived AS repo_archived
FROM
repo_groups A,
repo b
WHERE
A.repo_group_id = b.repo_group_id
ORDER BY
rg_name,
repo_name
) d,
(
SELECT DISTINCT
f.repo_id,
f.NAME,
f.libyear,
f.data_collection_date
FROM
( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e,
augur_data.repo_deps_libyear f
WHERE
e.data_collection_date = f.data_collection_date and
e.repo_id = f.repo_id
ORDER BY
NAME
) C
WHERE
d.repo_id = C.repo_id
GROUP BY
rg_name,
repo_git,
repo_group_id,
repo_name,
d.repo_id,
forked_from,
repo_archived,
c.name,
c.libyear
ORDER BY
repo_id) w,
repo_groups y,
repo z
where w.repo_id=z.repo_id and
y.repo_group_id=z.repo_group_id
and z.repo_group_id = :repo_group_id
""")

with current_app.engine.connect() as conn:
results = pd.read_sql(libyearSQL, conn, params={'repo_group_id': repo_group_id})
return results
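
For readers who want to try the new metric, here is a minimal usage sketch. It is not part of the commit: it assumes a configured Augur install so that the Flask app exposed as `augur.api.server:app` (the same object targeted by the Gunicorn command later in this diff) can be imported and can reach the database, and the ids used are placeholders.

```python
# Illustrative sketch only (not part of this commit): calling the new libyear
# metric directly from Python. Assumes a configured Augur install; repo_group_id=1
# and repo_id=1 are placeholder values.
from augur.api.server import app            # module targeted by the gunicorn command in this release
from augur.api.metrics.deps import libyear

with app.app_context():                     # libyear() reads current_app.engine internally
    # Latest libyear value per dependency for a single repo
    per_repo = libyear(repo_group_id=1, repo_id=1)
    # Latest libyear values across every repo in a repo group
    per_group = libyear(repo_group_id=1)
    print(per_repo[["repo_name", "name", "libyear", "most_recent_collection"]].head())
```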

7 changes: 7 additions & 0 deletions augur/api/routes/pull_request_reports.py
@@ -21,6 +21,12 @@
from bokeh.models.glyphs import Rect
from bokeh.transform import dodge, factor_cmap, transform

+# from selenium.webdriver import Firefox, FirefoxOptions
+# options = FirefoxOptions()
+# options.headless = True
+# webdriver = Firefox(options=options)
+#export_png(item, path, webdriver=webdriver)

warnings.filterwarnings('ignore')

from augur.api.routes import AUGUR_API_VERSION
@@ -604,6 +610,7 @@ def average_commits_per_PR():
# opts = FirefoxOptions()
# opts.add_argument("--headless")
# driver = webdriver.Firefox(firefox_options=opts)
+    # filename = export_png(grid, timeout=180, webdriver=webdriver)
filename = export_png(grid, timeout=180)

return send_file(filename)
2 changes: 1 addition & 1 deletion augur/api/view/init.py
@@ -91,4 +91,4 @@ def write_settings(current_settings):
# Initialize logging
def init_logging():
global logger
logger = AugurLogger("augur_view", reset_logfiles=True).get_logger()
logger = AugurLogger("augur_view", reset_logfiles=False).get_logger()
4 changes: 2 additions & 2 deletions augur/application/cli/__init__.py
@@ -32,7 +32,7 @@ def new_func(ctx, *args, **kwargs):
You are not connected to the internet.\n \
Please connect to the internet to run Augur\n \
Consider setting http_proxy variables for limited access installations.")
-        sys.exit()
+        sys.exit(-1)

return update_wrapper(new_func, function_internet_connection)

@@ -78,7 +78,7 @@ def new_func(ctx, *args, **kwargs):
print(f"\n\n{usage} command setup failed\nERROR: connecting to database\nHINT: The {incorrect_values} may be incorrectly specified in {location}\n")

engine.dispose()
-        sys.exit()
+        sys.exit(-2)

return update_wrapper(new_func, function_db_connection)
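
Because the two failure paths now exit with distinct codes, a supervising script can tell them apart. A minimal sketch, not part of the commit, assuming the `augur` CLI entry point is installed; note that on POSIX systems `sys.exit(-1)` and `sys.exit(-2)` surface as exit statuses 255 and 254, since the status is taken modulo 256.

```python
import subprocess

# Illustrative only: run an Augur CLI command and branch on the new exit codes.
result = subprocess.run(["augur", "backend", "start"])

if result.returncode == 255:        # sys.exit(-1): no internet connection
    print("Augur aborted: not connected to the internet")
elif result.returncode == 254:      # sys.exit(-2): database connection failed
    print("Augur aborted: could not connect to the database")
elif result.returncode != 0:
    print(f"Augur exited with status {result.returncode}")
```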

1 change: 0 additions & 1 deletion augur/application/cli/_multicommand.py
@@ -30,7 +30,6 @@ def get_command(self, ctx, name):

# Check that the command exists before importing
if not cmdfile.is_file():

return

# Prefer to raise exception instead of silencing it
9 changes: 5 additions & 4 deletions augur/application/cli/api.py
@@ -14,15 +14,16 @@

from augur.application.db.session import DatabaseSession
from augur.application.logs import AugurLogger
-from augur.application.cli import test_connection, test_db_connection, with_database
+from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext
from augur.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages
from augur.application.db.lib import get_value

logger = AugurLogger("augur", reset_logfiles=True).get_logger()
logger = AugurLogger("augur", reset_logfiles=False).get_logger()

@click.group('api', short_help='Commands for controlling the backend API server')
-def cli():
-    pass
+@click.pass_context
+def cli(ctx):
+    ctx.obj = DatabaseContext()

@cli.command("start")
@click.option("--development", is_flag=True, default=False, help="Enable development mode")
67 changes: 62 additions & 5 deletions augur/application/cli/backend.py
@@ -47,8 +47,8 @@ def cli(ctx):
@click.pass_context
def start(ctx, disable_collection, development, pidfile, port):
"""Start Augur's backend server."""
with open(pidfile, "w") as pidfile:
pidfile.write(str(os.getpid()))
with open(pidfile, "w") as pidfile_io:
pidfile_io.write(str(os.getpid()))

try:
if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1":
@@ -63,6 +63,8 @@ def start(ctx, disable_collection, development, pidfile, port):
if development:
os.environ["AUGUR_DEV"] = "1"
logger.info("Starting in development mode")

os.environ["AUGUR_PIDFILE"] = pidfile

try:
gunicorn_location = os.getcwd() + "/augur/api/gunicorn_conf.py"
@@ -74,6 +76,11 @@ def start(ctx, disable_collection, development, pidfile, port):
if not port:
port = get_value("Server", "port")

os.environ["AUGUR_PORT"] = str(port)

if disable_collection:
os.environ["AUGUR_DISABLE_COLLECTION"] = "1"

worker_vmem_cap = get_value("Celery", 'worker_process_vmem_cap')

gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app --log-file gunicorn.log"
@@ -128,7 +135,7 @@ def start(ctx, disable_collection, development, pidfile, port):
augur_collection_monitor.si().apply_async()

else:
logger.info("Collection disabled")
logger.info("Collection disabled")

try:
server.wait()
@@ -153,6 +160,8 @@ def start(ctx, disable_collection, development, pidfile, port):
cleanup_after_collection_halt(logger, ctx.obj.engine)
except RedisConnectionError:
pass

os.unlink(pidfile)

def start_celery_worker_processes(vmem_cap_ratio, disable_collection=False):

@@ -185,7 +194,7 @@ def determine_worker_processes(ratio,maximum):
sleep_time += 6

#60% of estimate, Maximum value of 45 : Reduced because it can be lower
-    core_num_processes = determine_worker_processes(.40, 50)
+    core_num_processes = determine_worker_processes(.40, 90)
logger.info(f"Starting core worker processes with concurrency={core_num_processes}")
core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h"
process_list.append(subprocess.Popen(core_worker.split(" ")))
@@ -224,6 +233,54 @@ def stop(ctx):

augur_stop(signal.SIGTERM, logger, ctx.obj.engine)

@cli.command('stop-collection-blocking')
@test_connection
@test_db_connection
@with_database
@click.pass_context
def stop_collection(ctx):
"""
Stop collection tasks if they are running, block until complete
"""
processes = get_augur_processes()

stopped = []

p: psutil.Process
for p in processes:
if p.name() == "celery":
stopped.append(p)
p.terminate()

if not len(stopped):
logger.info("No collection processes found")
return

_, alive = psutil.wait_procs(stopped, 5,
lambda p: logger.info(f"STOPPED: {p.pid}"))

killed = []
while True:
for i in range(len(alive)):
if alive[i].status() == psutil.STATUS_ZOMBIE:
logger.info(f"KILLING ZOMBIE: {alive[i].pid}")
alive[i].kill()
killed.append(i)
elif not alive[i].is_running():
logger.info(f"STOPPED: {p.pid}")
killed.append(i)

for i in reversed(killed):
alive.pop(i)

if not len(alive):
break

logger.info(f"Waiting on [{', '.join(str(p.pid for p in alive))}]")
time.sleep(0.5)

cleanup_after_collection_halt(logger, ctx.obj.engine)

@cli.command('kill')
@test_connection
@test_db_connection
@@ -388,7 +445,7 @@ def processes():
Outputs the name/PID of all Augur server & worker processes"""
augur_processes = get_augur_processes()
for process in augur_processes:
logger.info(f"Found process {process.pid}")
logger.info(f"Found process {process.pid} [{process.name()}] -> Parent: {process.parent().pid}")

def get_augur_processes():
augur_processes = []
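The updated `start` command above now exports `AUGUR_PIDFILE`, `AUGUR_PORT`, and `AUGUR_DISABLE_COLLECTION` into the environment before launching Gunicorn and the Celery workers. A minimal sketch of how a child process might read them; the variable names come from the diff, while the fallback port is only an illustrative default.

```python
import os

# Illustrative only: read the environment exported by `augur backend start`.
pidfile_path = os.environ.get("AUGUR_PIDFILE")                 # path of the server's pid file
port = int(os.environ.get("AUGUR_PORT", "5000"))               # 5000 is just a placeholder fallback
collection_disabled = os.environ.get("AUGUR_DISABLE_COLLECTION") == "1"

if collection_disabled:
    print(f"API running on port {port}; data collection workers were not started")
else:
    print(f"API running on port {port}; collection enabled (pidfile: {pidfile_path})")
```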
