Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UserAgentParser parsing #10114

Merged
merged 6 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions python/StatsLogParser/.vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Debug User Agent Test Module",
"type": "debugpy",
"request": "launch",
"module": "tests.test_useragentparser"
},
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}
1 change: 1 addition & 0 deletions python/StatsLogParser/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include loginterpretation/knownclients.yaml
2 changes: 2 additions & 0 deletions python/StatsLogParser/StatsLogParser.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
<SubType>Code</SubType>
</Compile>
<Compile Include="loginterpretation\semanticversion.py" />
<Compile Include="loginterpretation\useragentparser.py" />
<Compile Include="loginterpretation\__init__.py">
<SubType>Code</SubType>
</Compile>
Expand All @@ -51,6 +52,7 @@
</Interpreter>
</ItemGroup>
<ItemGroup>
<Content Include="loginterpretation\knownclients.yaml" />
<Content Include="requirements.txt" />
</ItemGroup>
<ItemGroup>
Expand Down
3 changes: 0 additions & 3 deletions python/StatsLogParser/loginterpretation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
"""
Log interpretation package
"""
222 changes: 222 additions & 0 deletions python/StatsLogParser/loginterpretation/knownclients.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
user_agent_parsers:
# NuGet MSBuild Task
- regex: '(NuGet MSBuild Task)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet MSBuild Task'

- regex: '(NuGet .NET Core MSBuild Task)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet .NET Core MSBuild Task'

- regex: '(NuGet Desktop MSBuild Task)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet Desktop MSBuild Task'

# NuGet Client V3
- regex: '(NuGet Client V3)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet Client V3'

# NuGet VS PowerShell Console (NuGet 2.8+)
- regex: '(NuGet VS PowerShell Console)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet VS PowerShell Console'

# NuGet VS Packages Dialog - Solution (NuGet 2.8+)
- regex: '(NuGet VS Packages Dialog - Solution)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet VS Packages Dialog - Solution'

# NuGet VS Packages Dialog (NuGet 2.8+)
- regex: '(NuGet VS Packages Dialog)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet VS Packages Dialog - Solution'

# NuGet Add Package Dialog (pre-NuGet 2.8)
- regex: '(NuGet Add Package Dialog)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet Add Package Dialog'

# NuGet Package Manager Console (pre-NuGet 2.8)
- regex: '(NuGet Package Manager Console)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet Package Manager Console'

# NuGet Visual Studio Extension (pre-NuGet 2.8)
- regex: '(NuGet Visual Studio Extension)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet Visual Studio Extension'

# Package-Installer (pre-NuGet 2.8)
- regex: '(Package-Installer)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Package-Installer'

# NuGet Command Line
- regex: '(NuGet Command Line)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet Command Line'

# NuGet xplat CLI
- regex: '(NuGet xplat)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet Cross-Platform Command Line'

# NuGet Core
- regex: '(NuGet Core)/?(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet'

# WebMatrix includes its own core version number as part of the client name, before the slash
- regex: '(WebMatrix) (\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'WebMatrix'

# NuGet Package Explorer (npe.codeplex.com)
- regex: '(NuGet Package Explorer Metro)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet Package Explorer Metro'
- regex: '(NuGet Package Explorer)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet Package Explorer'

# JetBrains TeamCity - uses a space to separate the client from the version instead of slash
- regex: '(JetBrains TeamCity) (\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'JetBrains TeamCity'

# JetBrains ReSharper Platform for VS10
- regex: '(ReSharperPlatformVs10)/?(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'JetBrains ReSharper Platform VS2010'

# JetBrains ReSharper Platform for VS11
- regex: '(ReSharperPlatformVs11)/?(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'JetBrains ReSharper Platform VS2012'

# JetBrains ReSharper Platform for VS12
- regex: '(ReSharperPlatformVs12)/?(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'JetBrains ReSharper Platform VS2013'

# JetBrains ReSharper Platform for VS14
- regex: '(ReSharperPlatformVs14)/?(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'JetBrains ReSharper Platform VS2015'

# JetBrains ReSharper
- regex: '(ReSharper)/(\d+)?\.?(\d+)?\.?(\d+)?'
family_replacement: 'JetBrains ReSharper'

# JetBrains dotPeek
- regex: '(dotPeek)/(\d+)\.(\d+)\.(\d+)\.?(\d+)?\)'
family_replacement: 'JetBrains dotPeek'

# JetBrains ReSharper Extension Manager
- regex: '(ReSharper Extension Manager)/(\d+)?\.?(\d+)?\.?(\d+)?'
family_replacement: 'JetBrains ReSharper Extension Manager'

# Sonatype Nexus (www.sonatype.com)
- regex: '(Nexus)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Sonatype Nexus'

# JFrog Artifactory (www.jfrog.com)
- regex: '(Artifactory)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'JFrog Artifactory'

# MyGet (www.myget.org)
- regex: '(MyGet)/?(\d+)?\.?(\d+)?\.?(\d+)?'
family_replacement: 'MyGet'

# ProGet (www.inedo.com)
- regex: '(ProGet)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Inedo ProGet'

# Paket (http://fsprojects.github.io/Paket)
- regex: '(Paket)/?(\d+)?\.?(\d+)?\.?(\d+)?'
family_replacement: 'Paket'

# Xamarin Studio (www.xamarin.com)
- regex: '(Xamarin Studio)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Xamarin Studio'

# MonoDevelop
- regex: '(MonoDevelop)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'MonoDevelop'

# MonoDevelop-Unity
- regex: '(MonoDevelop-Unity)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'MonoDevelop'

# SharpDevelop
- regex: '(SharpDevelop)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'SharpDevelop'

# DNX
- regex: '(Microsoft_.NET_Development_Utility)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'DNX Utility'
- regex: '(NuGet Shim)/?(\d+)?\.?(\d+)?\.?(\d+)?'
family_replacement: 'NuGet Shim'

# PowerShell
- regex: '(WindowsPowerShell)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Windows PowerShell'

# PowerShell Core
- regex: 'Mozilla.*(PowerShell)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'PowerShell Core'

# Fiddler
- regex: '(Fiddler)/?(\d+)?\.?(\d+)?\.?(\d+)?'
family_replacement: 'Fiddler'

# curl
- regex: '(curl)/?(\d+)?\.?(\d+)?\.?(\d+)?'
family_replacement: 'curl'

# Java
- regex: '(Java)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Java'

# NuGet Test Client - to be used when making test-calls to nuget.org endpoints
- regex: '(NuGet Test Client)/?(\d+)?\.?(\d+)?\.?(\d+)?'
family_replacement: 'NuGet Test Client'

# Cake NuGet Client
- regex: '(Cake NuGet Client)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Cake NuGet Client'

# Cake
- regex: '(Cake)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Cake'

# NuGet VS VSIX
- regex: '(NuGet VS VSIX)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet VS VSIX'

# Xamarin Updater
- regex: '(Xamarin Updater).*?\(Version: (\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Xamarin Updater'

# vsts-task-installer
- regex: '(vsts-task-installer)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'vsts-task-installer'

# Checkmarx 1
- regex: '(Checkmarx-NugetSourceCodePriorityCollector)'
family_replacement: 'Checkmarx NugetSourceCodePriorityCollector'

# Checkmarx 2
- regex: '(Checkmarx-NugetShaCollector)'
family_replacement: 'Checkmarx NugetShaCollector'

# Checkmarx 3
- regex: '(Checkmarx-NugetDllShaCollector)'
family_replacement: 'Checkmarx NugetDllShaCollector'

# Checkmarx 4
- regex: '(Checkmarx-SourceCodeDownloader)'
family_replacement: 'Checkmarx SourceCodeDownloader'

# Azure artifacts
- regex: '(AzureArtifacts)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Azure artifacts'

# Bazel
- regex: '(Bazel)/.*?(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Bazel'

# Visual Studio
- regex: '(Visual Studio)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'Visual Studio'

# NuGetMirror
- regex: '(NuGetMirror)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGetMirror'

# BaGet
- regex: '(BaGet)/(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'BaGet'

# NuGet - Keep this one at the bottom of this file as a catch-all resolver
- regex: '(NuGet)/?(\d+)\.(\d+)\.?(\d+)?'
family_replacement: 'NuGet'
122 changes: 122 additions & 0 deletions python/StatsLogParser/loginterpretation/useragentparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from __future__ import annotations
from collections import namedtuple
from typing import Optional
import re
import pkg_resources
from ua_parser import user_agent_parser
from ua_parser._regexes import USER_AGENT_PARSERS
import yaml

# Parse result: the client family name plus its major / minor / patch version parts.
UserAgent = namedtuple('UserAgent', ['family', 'major', 'minor', 'patch'])

class UserAgentParser:
    """Resolve a user agent string to a ``UserAgent`` tuple.

    ``parse`` consults three ordered rule sets and keeps the first result
    whose family is not "Other":

    1. ``KNOWN_CLIENTS_DATA`` - rules built from ``knownclients.yaml``.
    2. ``KNOWN_CLIENTS_IN_CHINA_DATA`` - the same rules, with spaces inside
       the leading client-name group rewritten to ``\\+``.
    3. ``DEFAULT_PARSER_DATA`` - the rule list shipped with ``ua_parser``.

    Both known-client lists start empty and are filled by
    ``__static_init__``. Results are memoised in a small class-level cache.
    """

    DEFAULT_PARSER_DATA = USER_AGENT_PARSERS
    KNOWN_CLIENTS_DATA: list[user_agent_parser.UserAgentParser] = []
    KNOWN_CLIENTS_IN_CHINA_DATA: list[user_agent_parser.UserAgentParser] = []

    # Memoised parse results keyed by the raw user agent string. The cache is
    # emptied wholesale (not evicted entry by entry) once it holds
    # _MAX_CACHE_SIZE items; see _lookup.
    _MAX_CACHE_SIZE = 200
    _PARSE_CACHE: dict[str, UserAgent] = {}

    @classmethod
    def __static_init__(cls):
        """Build both known-client parser lists and store them on the class."""
        cls.KNOWN_CLIENTS_DATA = cls._load_known_clients_parser()
        cls.KNOWN_CLIENTS_IN_CHINA_DATA = cls._load_known_clients_in_china_parser()

    @staticmethod
    def _load_known_clients_parser():
        """Return the parsers described by ``knownclients.yaml`` as-is."""
        yaml_content = UserAgentParser._read_known_clients_yaml()
        return UserAgentParser._create_parser_data_from_yaml(yaml_content)

    @staticmethod
    def _load_known_clients_in_china_parser():
        """Return the parsers from ``knownclients.yaml`` after the China CDN rewrite."""
        yaml_content = UserAgentParser._read_known_clients_yaml()
        patched_yaml = UserAgentParser._add_support_for_china_cdn(yaml_content)
        return UserAgentParser._create_parser_data_from_yaml(patched_yaml)

    @staticmethod
    def _add_support_for_china_cdn(yaml_content):
        """Rewrite spaces in leading client-name groups of the YAML text to ``\\+``.

        Only text of the form ``: '(<name>)`` is touched, where ``<name>``
        consists of word characters, ``-``, ``.`` and whitespace. Patterns
        that do not start with such a group are left unchanged.

        NOTE(review): presumably the China CDN logs record spaces in user
        agents as ``+`` - confirm against real log samples.

        Args:
            yaml_content: Raw text of the known-clients YAML document.

        Returns:
            The patched YAML text.
        """
        patched_yaml = re.sub(
            r"(?:[:]\s'\()+([\w\-.\s]+)(?:\))+",
            UserAgentParser._replace_whitespace_with_plus_sign,
            yaml_content,
            flags=re.DOTALL
        )
        return patched_yaml

    @staticmethod
    def _replace_whitespace_with_plus_sign(match):
        """Rebuild a matched ``: '(<name>)`` span with each space replaced by ``\\+``."""
        return ": '(" + match.group(1).replace(" ", r"\+") + ")"

    @staticmethod
    def _read_known_clients_yaml() -> str:
        """Return the text of the ``knownclients.yaml`` file packaged with this module."""
        file_name = pkg_resources.resource_filename(__name__, 'knownclients.yaml')
        # 'utf-8-sig' drops a leading byte-order mark if the file has one.
        with open(file_name, 'r', encoding='utf-8-sig') as file:
            return file.read()

    @staticmethod
    def _create_parser_data_from_yaml(yaml_content) -> list[user_agent_parser.UserAgentParser]:
        """Turn YAML text into a list of ``ua_parser`` parser objects.

        Args:
            yaml_content: YAML text with a top-level ``user_agent_parsers``
                list. Each entry needs a ``regex`` key and may also carry
                ``family_replacement``, ``v1_replacement`` and
                ``v2_replacement``.

        Returns:
            One parser per entry, in document order.

        Raises:
            KeyError: If ``user_agent_parsers`` or an entry's ``regex`` is missing.
        """
        data = yaml.safe_load(yaml_content)

        parsers: list[user_agent_parser.UserAgentParser] = []

        for rule in data["user_agent_parsers"]:
            parsers.append(
                user_agent_parser.UserAgentParser(
                    rule["regex"],
                    # Absent replacement keys are passed through as None.
                    rule.get("family_replacement"),
                    rule.get("v1_replacement"),
                    rule.get("v2_replacement"),
                )
            )

        return parsers

    @staticmethod
    def _lookup(ua: str) -> Optional[UserAgent]:
        """Return the cached result for ``ua``, or ``None`` on a cache miss.

        On a miss the whole cache is cleared if it has reached
        ``_MAX_CACHE_SIZE``, making room for the entry ``parse`` stores next.
        """
        entry = UserAgentParser._PARSE_CACHE.get(ua)
        if entry is not None:
            return entry

        if len(UserAgentParser._PARSE_CACHE) >= UserAgentParser._MAX_CACHE_SIZE:
            UserAgentParser._PARSE_CACHE.clear()

        return None

    @staticmethod
    def parse(user_agent_string: str) -> UserAgent:
        """Parse using known clients parser, then known clients in China parser, then default parser.

        Args:
            user_agent_string: Raw user agent string to classify.

        Returns:
            A ``UserAgent``; its family is "Other" when no rule set matched.
            The result is cached under ``user_agent_string``.
        """
        entry = UserAgentParser._lookup(user_agent_string)
        if entry is not None:
            return entry

        # Fall through to the next rule set only while the family is still "Other".
        rule_sets = (
            UserAgentParser.KNOWN_CLIENTS_DATA,
            UserAgentParser.KNOWN_CLIENTS_IN_CHINA_DATA,
            UserAgentParser.DEFAULT_PARSER_DATA,
        )
        for parsers in rule_sets:
            entry = UserAgentParser._parse_user_agent_with_parsers(user_agent_string, parsers)
            if entry.family.lower() != 'other':
                break

        UserAgentParser._PARSE_CACHE[user_agent_string] = entry
        return entry

    @staticmethod
    def _parse_user_agent_with_parsers(user_agent_string: str, parsers: list[user_agent_parser.UserAgentParser]) -> UserAgent:
        """Run ``parsers`` in order and return the first result with a family.

        Args:
            user_agent_string: Raw user agent string to classify.
            parsers: Ordered parser objects; may be empty.

        Returns:
            A ``UserAgent`` for the first parser that yields a truthy family,
            otherwise ``UserAgent("Other", None, None, None)``. Falsy version
            parts are normalised to ``None``.
        """
        # Start from "no match" so an empty parser list yields "Other" instead
        # of raising UnboundLocalError on the names assigned inside the loop.
        family = v1 = v2 = v3 = None

        for candidate in parsers:
            family, v1, v2, v3 = candidate.Parse(user_agent_string)
            if family:
                break

        return UserAgent(family or "Other", v1 or None, v2 or None, v3 or None)

# Populate the class-level known-client parser lists once, at module import time.
UserAgentParser.__static_init__()
Loading