Merge pull request #2898 from chaoss/augur-release-0.76.2
Augur release 0.76.2
sgoggins authored Sep 25, 2024

Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
2 parents fe11b0e + ef39e84 commit c8eba65
Showing 55 changed files with 15,473 additions and 2,391 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ augur_export_env.sh
config.yml
reports.yml
*.pid
+*.sock

node_modules/
.idea/
4 changes: 2 additions & 2 deletions README.md
@@ -1,4 +1,4 @@
-# Augur NEW Release v0.76.1
+# Augur NEW Release v0.76.2

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io
@@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o
## NEW RELEASE ALERT!
### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md)

-Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.1
+Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.2

- The `main` branch is a stable version of our new architecture, which features:
- Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
158 changes: 158 additions & 0 deletions augur/api/metrics/deps.py
@@ -77,4 +77,162 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No
return results


@register_metric()
def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None):
"""
Returns the most recent libyear (dependency freshness) value for each dependency in a repo or repo group.
:param repo_id: The repository's id
:param repo_group_id: The repository's group id
:param period: To set the periodicity to 'day', 'week', 'month' or 'year', defaults to 'day'
:param begin_date: Specifies the begin date, defaults to '1970-1-1 00:00:00'
:param end_date: Specifies the end date, defaults to datetime.now()
:return: DataFrame of dependencies and their libyear values
"""

if not begin_date:
begin_date = '1970-1-1 00:00:01'
if not end_date:
end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

if repo_id:

libyearSQL = s.sql.text("""
SELECT
rg_name,
repo_group_id,
repo_name,
d.repo_id,
repo_git,
forked_from,
repo_archived,
c.name,
c.libyear,
MAX ( C.data_collection_date ) AS most_recent_collection
FROM
(
SELECT A.rg_name AS rg_name,
A.repo_group_id AS repo_group_id,
b.repo_name AS repo_name,
b.repo_id AS repo_id,
b.repo_git AS repo_git,
b.forked_from AS forked_from,
b.repo_archived AS repo_archived
FROM
repo_groups A,
repo b
WHERE
A.repo_group_id = b.repo_group_id
ORDER BY
rg_name,
repo_name
) d,
(
SELECT DISTINCT
f.repo_id,
f.NAME,
f.libyear,
f.data_collection_date
FROM
( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e,
augur_data.repo_deps_libyear f
WHERE
e.data_collection_date = f.data_collection_date and
e.repo_id = f.repo_id
ORDER BY
NAME
) C
WHERE
d.repo_id = C.repo_id
AND C.repo_id = :repo_id
GROUP BY
rg_name,
repo_git,
repo_group_id,
repo_name,
d.repo_id,
forked_from,
repo_archived,
c.name,
c.libyear
ORDER BY
repo_id;
""")

with current_app.engine.connect() as conn:
results = pd.read_sql(libyearSQL, conn, params={'repo_id': repo_id})

else:

libyearSQL = s.sql.text("""
Select w.* from
(
SELECT
rg_name,
repo_group_id,
repo_name,
d.repo_id,
repo_git,
forked_from,
repo_archived,
c.name,
c.libyear,
MAX ( C.data_collection_date ) AS most_recent_collection
FROM
(
SELECT A.rg_name AS rg_name,
A.repo_group_id AS repo_group_id,
b.repo_name AS repo_name,
b.repo_id AS repo_id,
b.repo_git AS repo_git,
b.forked_from AS forked_from,
b.repo_archived AS repo_archived
FROM
repo_groups A,
repo b
WHERE
A.repo_group_id = b.repo_group_id
ORDER BY
rg_name,
repo_name
) d,
(
SELECT DISTINCT
f.repo_id,
f.NAME,
f.libyear,
f.data_collection_date
FROM
( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e,
augur_data.repo_deps_libyear f
WHERE
e.data_collection_date = f.data_collection_date and
e.repo_id = f.repo_id
ORDER BY
NAME
) C
WHERE
d.repo_id = C.repo_id
GROUP BY
rg_name,
repo_git,
repo_group_id,
repo_name,
d.repo_id,
forked_from,
repo_archived,
c.name,
c.libyear
ORDER BY
repo_id) w,
repo_groups y,
repo z
where w.repo_id=z.repo_id and
y.repo_group_id=z.repo_group_id
and z.repo_group_id = :repo_group_id
""")

with current_app.engine.connect() as conn:
results = pd.read_sql(libyearSQL, conn, params={'repo_group_id': repo_group_id})
return results
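
For readers who want to try the new metric, here is a minimal usage sketch. It is not part of the commit: it assumes a configured Augur install so that the Flask app exposed as `augur.api.server:app` (the same object targeted by the Gunicorn command later in this diff) can be imported and can reach the database, and the ids used are placeholders.

```python
# Illustrative sketch only (not part of this commit): calling the new libyear
# metric directly from Python. Assumes a configured Augur install; repo_group_id=1
# and repo_id=1 are placeholder values.
from augur.api.server import app            # module targeted by the gunicorn command in this release
from augur.api.metrics.deps import libyear

with app.app_context():                     # libyear() reads current_app.engine internally
    # Latest libyear value per dependency for a single repo
    per_repo = libyear(repo_group_id=1, repo_id=1)
    # Latest libyear values across every repo in a repo group
    per_group = libyear(repo_group_id=1)
    print(per_repo[["repo_name", "name", "libyear", "most_recent_collection"]].head())
```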

7 changes: 7 additions & 0 deletions augur/api/routes/pull_request_reports.py
@@ -21,6 +21,12 @@
from bokeh.models.glyphs import Rect
from bokeh.transform import dodge, factor_cmap, transform

+# from selenium.webdriver import Firefox, FirefoxOptions
+# options = FirefoxOptions()
+# options.headless = True
+# webdriver = Firefox(options=options)
+#export_png(item, path, webdriver=webdriver)

warnings.filterwarnings('ignore')

from augur.api.routes import AUGUR_API_VERSION
@@ -604,6 +610,7 @@ def average_commits_per_PR():
# opts = FirefoxOptions()
# opts.add_argument("--headless")
# driver = webdriver.Firefox(firefox_options=opts)
+    # filename = export_png(grid, timeout=180, webdriver=webdriver)
filename = export_png(grid, timeout=180)

return send_file(filename)
2 changes: 1 addition & 1 deletion augur/api/view/init.py
@@ -91,4 +91,4 @@ def write_settings(current_settings):
# Initialize logging
def init_logging():
global logger
logger = AugurLogger("augur_view", reset_logfiles=True).get_logger()
logger = AugurLogger("augur_view", reset_logfiles=False).get_logger()
4 changes: 2 additions & 2 deletions augur/application/cli/__init__.py
@@ -32,7 +32,7 @@ def new_func(ctx, *args, **kwargs):
You are not connected to the internet.\n \
Please connect to the internet to run Augur\n \
Consider setting http_proxy variables for limited access installations.")
-        sys.exit()
+        sys.exit(-1)

return update_wrapper(new_func, function_internet_connection)

@@ -78,7 +78,7 @@ def new_func(ctx, *args, **kwargs):
print(f"\n\n{usage} command setup failed\nERROR: connecting to database\nHINT: The {incorrect_values} may be incorrectly specified in {location}\n")

engine.dispose()
-        sys.exit()
+        sys.exit(-2)

return update_wrapper(new_func, function_db_connection)
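
Because the two failure paths now exit with distinct codes, a supervising script can tell them apart. A minimal sketch, not part of the commit, assuming the `augur` CLI entry point is installed; note that on POSIX systems `sys.exit(-1)` and `sys.exit(-2)` surface as exit statuses 255 and 254, since the status is taken modulo 256.

```python
import subprocess

# Illustrative only: run an Augur CLI command and branch on the new exit codes.
result = subprocess.run(["augur", "backend", "start"])

if result.returncode == 255:        # sys.exit(-1): no internet connection
    print("Augur aborted: not connected to the internet")
elif result.returncode == 254:      # sys.exit(-2): database connection failed
    print("Augur aborted: could not connect to the database")
elif result.returncode != 0:
    print(f"Augur exited with status {result.returncode}")
```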

1 change: 0 additions & 1 deletion augur/application/cli/_multicommand.py
@@ -30,7 +30,6 @@ def get_command(self, ctx, name):

# Check that the command exists before importing
if not cmdfile.is_file():

return

# Prefer to raise exception instead of silencing it
9 changes: 5 additions & 4 deletions augur/application/cli/api.py
@@ -14,15 +14,16 @@

from augur.application.db.session import DatabaseSession
from augur.application.logs import AugurLogger
-from augur.application.cli import test_connection, test_db_connection, with_database
+from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext
from augur.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages
from augur.application.db.lib import get_value

logger = AugurLogger("augur", reset_logfiles=True).get_logger()
logger = AugurLogger("augur", reset_logfiles=False).get_logger()

@click.group('api', short_help='Commands for controlling the backend API server')
-def cli():
-    pass
+@click.pass_context
+def cli(ctx):
+    ctx.obj = DatabaseContext()

@cli.command("start")
@click.option("--development", is_flag=True, default=False, help="Enable development mode")
67 changes: 62 additions & 5 deletions augur/application/cli/backend.py
@@ -47,8 +47,8 @@ def cli(ctx):
@click.pass_context
def start(ctx, disable_collection, development, pidfile, port):
"""Start Augur's backend server."""
with open(pidfile, "w") as pidfile:
pidfile.write(str(os.getpid()))
with open(pidfile, "w") as pidfile_io:
pidfile_io.write(str(os.getpid()))

try:
if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1":
@@ -63,6 +63,8 @@ def start(ctx, disable_collection, development, pidfile, port):
if development:
os.environ["AUGUR_DEV"] = "1"
logger.info("Starting in development mode")

os.environ["AUGUR_PIDFILE"] = pidfile

try:
gunicorn_location = os.getcwd() + "/augur/api/gunicorn_conf.py"
@@ -74,6 +76,11 @@ def start(ctx, disable_collection, development, pidfile, port):
if not port:
port = get_value("Server", "port")

os.environ["AUGUR_PORT"] = str(port)

if disable_collection:
os.environ["AUGUR_DISABLE_COLLECTION"] = "1"

worker_vmem_cap = get_value("Celery", 'worker_process_vmem_cap')

gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app --log-file gunicorn.log"
@@ -128,7 +135,7 @@ def start(ctx, disable_collection, development, pidfile, port):
augur_collection_monitor.si().apply_async()

else:
logger.info("Collection disabled")
logger.info("Collection disabled")

try:
server.wait()
@@ -153,6 +160,8 @@ def start(ctx, disable_collection, development, pidfile, port):
cleanup_after_collection_halt(logger, ctx.obj.engine)
except RedisConnectionError:
pass

os.unlink(pidfile)

def start_celery_worker_processes(vmem_cap_ratio, disable_collection=False):

@@ -185,7 +194,7 @@ def determine_worker_processes(ratio,maximum):
sleep_time += 6

#60% of estimate, Maximum value of 45 : Reduced because it can be lower
-    core_num_processes = determine_worker_processes(.40, 50)
+    core_num_processes = determine_worker_processes(.40, 90)
logger.info(f"Starting core worker processes with concurrency={core_num_processes}")
core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h"
process_list.append(subprocess.Popen(core_worker.split(" ")))
@@ -224,6 +233,54 @@ def stop(ctx):

augur_stop(signal.SIGTERM, logger, ctx.obj.engine)

@cli.command('stop-collection-blocking')
@test_connection
@test_db_connection
@with_database
@click.pass_context
def stop_collection(ctx):
"""
Stop collection tasks if they are running, block until complete
"""
processes = get_augur_processes()

stopped = []

p: psutil.Process
for p in processes:
if p.name() == "celery":
stopped.append(p)
p.terminate()

if not len(stopped):
logger.info("No collection processes found")
return

_, alive = psutil.wait_procs(stopped, 5,
lambda p: logger.info(f"STOPPED: {p.pid}"))

killed = []
while True:
for i in range(len(alive)):
if alive[i].status() == psutil.STATUS_ZOMBIE:
logger.info(f"KILLING ZOMBIE: {alive[i].pid}")
alive[i].kill()
killed.append(i)
elif not alive[i].is_running():
logger.info(f"STOPPED: {p.pid}")
killed.append(i)

for i in reversed(killed):
alive.pop(i)

if not len(alive):
break

logger.info(f"Waiting on [{', '.join(str(p.pid for p in alive))}]")
time.sleep(0.5)

cleanup_after_collection_halt(logger, ctx.obj.engine)

@cli.command('kill')
@test_connection
@test_db_connection
@@ -388,7 +445,7 @@ def processes():
Outputs the name/PID of all Augur server & worker processes"""
augur_processes = get_augur_processes()
for process in augur_processes:
logger.info(f"Found process {process.pid}")
logger.info(f"Found process {process.pid} [{process.name()}] -> Parent: {process.parent().pid}")

def get_augur_processes():
augur_processes = []
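The updated `start` command above now exports `AUGUR_PIDFILE`, `AUGUR_PORT`, and `AUGUR_DISABLE_COLLECTION` into the environment before launching Gunicorn and the Celery workers. A minimal sketch of how a child process might read them; the variable names come from the diff, while the fallback port is only an illustrative default.

```python
import os

# Illustrative only: read the environment exported by `augur backend start`.
pidfile_path = os.environ.get("AUGUR_PIDFILE")                 # path of the server's pid file
port = int(os.environ.get("AUGUR_PORT", "5000"))               # 5000 is just a placeholder fallback
collection_disabled = os.environ.get("AUGUR_DISABLE_COLLECTION") == "1"

if collection_disabled:
    print(f"API running on port {port}; data collection workers were not started")
else:
    print(f"API running on port {port}; collection enabled (pidfile: {pidfile_path})")
```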
