"""Collect usage analytics and hand them over to a background sender."""
import json
import logging
import os
import platform
import sys
import tempfile
import uuid

import distro
import requests

from dvc import __version__
from dvc.config import Config, to_bool
from dvc.daemon import daemon
from dvc.exceptions import NotDvcRepoError
from dvc.lock import Lock, LockError
from dvc.repo import Repo
from dvc.scm import SCM
from dvc.utils import env2bool, is_binary, makedirs
from dvc.utils.compat import str, FileNotFoundError

logger = logging.getLogger(__name__)


def collect_and_send_report(args=None, return_code=None):
    """
    Collect information from the runtime/environment and the command
    being executed into a report and send it over the network.

    To prevent analytics from blocking the execution of the main thread,
    sending the report is done in a separate process.

    The inter-process communication happens through a file containing the
    report as a JSON, where the _collector_ generates it and the _sender_
    removes it after sending it.

    Args:
        args: parsed CLI arguments; when it has a `func` attribute, the
            name of that callable is recorded as "cmd_class".
        return_code: exit code of the command; recorded as
            "cmd_return_code" unless it is None.
    """
    report = _runtime_info()

    # Include command execution information on the report only when available.
    if args and hasattr(args, "func"):
        report["cmd_class"] = args.func.__name__

    if return_code is not None:
        report["cmd_return_code"] = return_code

    with tempfile.NamedTemporaryFile(delete=False, mode="w") as fobj:
        json.dump(report, fobj)

    # Hand the path over only after the file has been closed, so the report
    # is fully written to disk before the sender gets a chance to read it.
    daemon(["analytics", fobj.name])


def is_enabled():
    """
    Tell whether usage analytics should be collected.

    Returns:
        bool: False when the `DVC_TEST` environment variable is truthy;
        otherwise the value of the core analytics config option, which
        defaults to "true" when it is not set.
    """
    if env2bool("DVC_TEST"):
        return False

    core = Config(validate=False).config.get(Config.SECTION_CORE, {})
    enabled = to_bool(core.get(Config.SECTION_CORE_ANALYTICS, "true"))

    logger.debug("Analytics is {}abled.".format("en" if enabled else "dis"))

    return enabled


def send(report):
    """
    Side effect: Removes the report after sending it.

    The report is generated and stored in a temporary file, see:
    `collect_and_send_report`. Sending happens on another process,
    thus, the need of removing such file afterwards.

    Analytics are best effort: a failed request is logged at debug level
    and otherwise ignored, and the report is removed in that case as well
    so that temporary files do not pile up.

    Args:
        report (str): path to the JSON file containing the report.
    """
    url = "https://analytics.dvc.org"
    headers = {"content-type": "application/json"}

    try:
        with open(report, "rb") as fobj:
            requests.post(url, data=fobj, headers=headers, timeout=5)
    except requests.exceptions.RequestException as exc:
        logger.debug("Failed to send analytics: {}".format(str(exc)))

    # The file is closed at this point, so it is safe to delete it.
    os.remove(report)


def _scm_in_use():
    """
    Return the class name of the SCM used by the current repository, or
    None when not running inside a DVC repository.
    """
    try:
        scm = SCM(root_dir=Repo.find_root())
    except NotDvcRepoError:
        return None

    return type(scm).__name__


def _runtime_info():
    """
    Gather information from the environment where DVC runs to fill a report.
    """
    return {
        "dvc_version": __version__,
        "is_binary": is_binary(),
        "scm_class": _scm_in_use(),
        "system_info": _system_info(),
        "user_id": _find_or_create_user_id(),
    }


def _system_info():
    """
    Describe the operating system DVC is running on.

    Returns:
        dict: an "os" key ("windows", "mac" or "linux") plus the version
        details that are specific to that system.

    Raises:
        NotImplementedError: when running on any other system.
    """
    system = platform.system()

    if system == "Windows":
        version = sys.getwindowsversion()  # pylint: disable=no-member

        return {
            "os": "windows",
            "windows_version_build": version.build,
            "windows_version_major": version.major,
            "windows_version_minor": version.minor,
            "windows_version_service_pack": version.service_pack,
        }

    if system == "Darwin":
        return {"os": "mac", "mac_version": platform.mac_ver()[0]}

    if system == "Linux":
        return {
            "os": "linux",
            "linux_distro": distro.id(),
            "linux_distro_like": distro.like(),
            "linux_distro_version": distro.version(),
        }

    # We don't collect data for any other system.
    raise NotImplementedError


def _find_or_create_user_id():
    """
    The user's ID is stored on a file under the global config directory.

    The file should contain a JSON with a "user_id" key:

        {"user_id": "16fd2706-8baf-433b-82eb-8c7fada847da"}

    IDs are generated randomly with UUID.

    Returns:
        The stored (or newly generated) ID, or None when the lock guarding
        the file could not be acquired.
    """
    config_dir = Config.get_global_config_dir()
    fname = os.path.join(config_dir, "user_id")
    lockfile = os.path.join(config_dir, "user_id.lock")

    # Since the `fname` and `lockfile` are under the global config,
    # we need to make sure such directory exist already.
    makedirs(config_dir, exist_ok=True)

    try:
        with Lock(lockfile):
            try:
                with open(fname, "r") as fobj:
                    user_id = json.load(fobj)["user_id"]

            # TypeError covers a file holding valid JSON that is not an
            # object (e.g. a list or null); it is as unusable as a missing
            # or malformed file, so a fresh ID is generated for it too.
            except (FileNotFoundError, ValueError, KeyError, TypeError):
                user_id = str(uuid.uuid4())

                with open(fname, "w") as fobj:
                    json.dump({"user_id": user_id}, fobj)

            return user_id

    except LockError:
        logger.debug("Failed to acquire {lockfile}".format(lockfile=lockfile))
        return None
if analytics.is_enabled(): + analytics.collect_and_send_report(args, ret) return ret diff --git a/dvc/repo/init.py b/dvc/repo/init.py index e06307e0a1..5df1d0f1f0 100644 --- a/dvc/repo/init.py +++ b/dvc/repo/init.py @@ -3,7 +3,7 @@ import colorama -from dvc.analytics import Analytics +from dvc import analytics from dvc.config import Config from dvc.exceptions import InitError from dvc.repo import Repo @@ -17,7 +17,7 @@ def _welcome_message(): - if Analytics.is_enabled(): + if analytics.is_enabled(): logger.info( boxify( "DVC has enabled anonymous aggregate usage analytics.\n" diff --git a/tests/func/test_analytics.py b/tests/func/test_analytics.py index b42a117270..a2eccfd62f 100644 --- a/tests/func/test_analytics.py +++ b/tests/func/test_analytics.py @@ -1,65 +1,20 @@ -import os - import mock -import requests -from dvc.analytics import Analytics from dvc.main import main -from tests.basic_env import TestDir -from tests.basic_env import TestDvc -from tests.basic_env import TestGit - - -def _clean_getenv(key, default=None): - """ - Remove env vars that affect dvc behavior in tests - """ - if key in ["DVC_TEST", "CI"]: - return None - return os.environ.get(key, default) - - -class TestAnalytics(TestDir): - def test(self): - a = Analytics() - a.collect() - self.assertTrue(isinstance(a.info, dict)) - self.assertNotEqual(a.info, {}) - self.assertTrue(a.PARAM_USER_ID in a.info.keys()) - self.assertTrue(a.PARAM_SYSTEM_INFO in a.info.keys()) - self.assertNotEqual(a.info[a.PARAM_SYSTEM_INFO], {}) - - @mock.patch.object(os, "getenv", new=_clean_getenv) - @mock.patch("requests.post") - def test_send(self, mockpost): - ret = main(["daemon", "analytics", Analytics().dump(), "-v"]) - self.assertEqual(ret, 0) - - self.assertTrue(mockpost.called) - - @mock.patch.object(os, "getenv", new=_clean_getenv) - @mock.patch.object( - requests, "post", side_effect=requests.exceptions.RequestException() - ) - def test_send_failed(self, mockpost): - ret = main(["daemon", "analytics", 
Analytics().dump(), "-v"]) - self.assertEqual(ret, 0) - - self.assertTrue(mockpost.called) - +from dvc.utils.compat import fspath -class TestAnalyticsGit(TestAnalytics, TestGit): - pass +@mock.patch("dvc.analytics.send") +def test_daemon_analytics(mock_send, tmp_path): + report = fspath(tmp_path) + assert 0 == main(["daemon", "analytics", report]) -class TestAnalyticsDvc(TestAnalytics, TestDvc): - @mock.patch("requests.post") - def test_send_disabled(self, mockpost): - ret = main(["config", "core.analytics", "false"]) - self.assertEqual(ret, 0) + mock_send.assert_called_with(report) - with mock.patch.object(os, "getenv", new=_clean_getenv): - ret = main(["daemon", "analytics", Analytics().dump(), "-v"]) - self.assertEqual(ret, 0) - self.assertFalse(mockpost.called) +@mock.patch("dvc.analytics.collect_and_send_report") +@mock.patch("dvc.analytics.is_enabled", return_value=True) +def test_main_analytics(mock_is_enabled, mock_report, dvc_repo): + assert 0 == main(["add", "foo"]) + assert mock_is_enabled.called + assert mock_report.called diff --git a/tests/unit/test_analytics.py b/tests/unit/test_analytics.py index 9e7e2dab14..43eb37dcbc 100644 --- a/tests/unit/test_analytics.py +++ b/tests/unit/test_analytics.py @@ -1,6 +1,80 @@ import pytest +import mock +import platform +import json -from dvc.analytics import Analytics +from voluptuous import Schema, Any + +from dvc import analytics +from dvc.cli import parse_args +from dvc.utils.compat import str, builtin_str + + +string = Any(str, builtin_str) + + +@pytest.fixture +def tmp_global_config(tmp_path): + """ + Fixture to prevent modifying the actual global config + """ + with mock.patch( + "dvc.config.Config.get_global_config_dir", return_value=str(tmp_path) + ): + yield + + +@mock.patch("dvc.daemon._spawn") +@mock.patch("json.dump") +def test_collect_and_send_report(mock_json, mock_daemon, tmp_global_config): + analytics.collect_and_send_report() + report = mock_json.call_args[0][0] + + with pytest.raises(KeyError): 
+ report["cmd_class"] + + with pytest.raises(KeyError): + report["cmd_return_code"] + + args = parse_args(["add", "foo"]) + return_code = 0 + + analytics.collect_and_send_report(args, return_code) + report = mock_json.call_args[0][0] + + assert report["cmd_class"] == "CmdAdd" + assert report["cmd_return_code"] == return_code + + assert mock_daemon.call_count == 2 + + +def test_runtime_info(tmp_global_config): + schema = Schema( + { + "dvc_version": string, + "is_binary": bool, + "scm_class": Any("Git", None), + "user_id": string, + "system_info": dict, + }, + required=True, + ) + + assert schema(analytics._runtime_info()) + + +@mock.patch("requests.post") +def test_send(mock_post, tmp_path): + url = "https://analytics.dvc.org" + report = {"name": "dummy report"} + fname = str(tmp_path / "report") + + with open(fname, "w") as fobj: + json.dump(report, fobj) + + analytics.send(fname) + assert mock_post.called + assert mock_post.call_args.args[0] == url @pytest.mark.parametrize( @@ -13,7 +87,7 @@ ({"analytics": "false", "unknown": "broken"}, False), ], ) -def test_is_enabled(dvc_repo, config, result, monkeypatch): +def test_is_enabled(dvc_repo, config, result, monkeypatch, tmp_global_config): configobj = dvc_repo.config._repo_config configobj["core"] = config configobj.write() @@ -21,4 +95,41 @@ def test_is_enabled(dvc_repo, config, result, monkeypatch): # reset DVC_TEST env var, which affects `is_enabled()` monkeypatch.delenv("DVC_TEST") - assert result == Analytics.is_enabled() + assert result == analytics.is_enabled() + + +def test_system_info(): + schema = Schema({"os": Any("windows", "mac", "linux")}, required=True) + + system = platform.system() + + if system == "Windows": + schema = schema.extend( + { + "windows_version_build": int, + "windows_version_major": int, + "windows_version_minor": int, + "windows_version_service_pack": string, + } + ) + + if system == "Darwin": + schema = schema.extend({"mac_version": string}) + + if system == "Linux": + schema = 
schema.extend( + { + "linux_distro": string, + "linux_distro_like": string, + "linux_distro_version": string, + } + ) + + assert schema(analytics._system_info()) + + +def test_find_or_create_user_id(tmp_global_config): + created = analytics._find_or_create_user_id() + found = analytics._find_or_create_user_id() + + assert created == found