Skip to content

Commit

Permalink
Collect memdumps for hung tests (#3552)
Browse files Browse the repository at this point in the history
Resolves #3298
  • Loading branch information
RussKie authored Jul 12, 2020
1 parent 720ef24 commit f88abd1
Show file tree
Hide file tree
Showing 4 changed files with 222 additions and 22 deletions.
159 changes: 159 additions & 0 deletions eng/MemDump.win.psm1
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
<#
Licensed to the .NET Foundation under one or more agreements. The .NET Foundation licenses this file to you under the MIT license.
See the LICENSE file in the project root for more information.
#>

# Collection of powershell build utility functions that we use across our scripts.
# Copied from https://raw.githubusercontent.com/dotnet/roslyn/3f851026b30b335ae328ce38a7817d93b96ad4ad/eng/build-utils-win.ps1

Set-StrictMode -version 2.0
$ErrorActionPreference = "Stop"
$FLAG_FILE = "memdump.lock"

. $PSScriptRoot\common\pipeline-logging-functions.ps1

function Get-Now {
return (Get-Date).ToString("yyyy-MM-yy HH:mm:ss")
}
function Start-MemDumpTimer() {
<#
.SYNOPSIS
Initiates the timer to monitor a process.
.DESCRIPTION
Starts process monitoring by setting a lock file in the specified directory.
If the target folder already contains a lock file, then it is assumed there is another
process that is monitoring processes, this is the end.
The monitoring is performed either until the lock file
in the target folder is deleted, or the allotted time has run out.
.PARAMETER WorkingDir
The full path to a repository root folder.
.PARAMETER TargetDir
The full path to a folder that contains the lock file.
.PARAMETER WaitSeconds
The time (in seconds) allowed for a process to run before it is considered unresponsive.
At which point a memory dump is captured, and the process is terminated
.INPUTS
None.
.OUTPUTS
None.
#>
Param (
[string] $WorkingDir,
[string] $TargetDir,
[int] $WaitSeconds
)

$flagFile = Join-Path $TargetDir $FLAG_FILE;
$logFile = $flagFile.Replace('.lock', '.log');
"[START] $WorkingDir, flag: $flagFile" >> $logFile;
Push-Location $WorkingDir

$hasFlagFile = Test-Path $flagFile
if ($hasFlagFile) {
Write-PipelineTaskError -Message "MemDump timer is already running!" -Type 'warning'
"[$(Get-Now)] MemDump timer is already running!" >> $logFile;
return;
}

'' | Out-File $flagFile

$processes = @();

do {
$hasFlagFile = Test-Path $flagFile
if (!$hasFlagFile) {
Write-PipelineTaskError -Message "MemDump timer is stopped" -Type 'warning'
"[$(Get-Now)] MemDump timer is stopped" >> $logFile;
return;
}

"[$(Get-Now)] MemDump wait for $WaitSeconds seconds" >> $logFile;
Start-Sleep -Seconds $WaitSeconds

Get-WmiObject win32_process -Filter "name like 'dotnet.exe'" | `
Where-Object { $_.CommandLine.Contains('System.Windows.Forms') } | `
Select-Object ProcessId, ProcessName, CommandLine | `
ForEach-Object {
$processes += $_;
}

# collect all memory dump before killing processes, as killing one process may lead to cascade kill
$processes | ForEach-Object {
$processId = $_.ProcessId;
$dumpFile = Join-Path $TargetDir "dotnet.exe.$processId.dmp";

"[$(Get-Now)] MemDump collect for pid: $processId, file: $dumpFile" >> $logFile;
try {
.\.tools\dotnet-dump collect --process-id $processId --type Heap --output $dumpFile
"[$(Get-Now)] MemDump collected!" >> $logFile;
}
catch {
$_ >> $logFile;
}
}

# kill all identified processes
$processes | ForEach-Object {
$processId = $_.ProcessId;
"[$(Get-Now)] MemDump kill pid: $processId" >> $logFile;
Stop-Process -Id $processId
"[$(Get-Now)] MemDump killed" >> $logFile;
}

$hasFlagFile = Test-Path $flagFile
}
until (!$hasFlagFile)
}

function Stop-MemDumpTimer() {
<#
.SYNOPSIS
Stop process monitoring by removing the lock file.
.DESCRIPTION
Stop process monitoring by removing the lock file.
.PARAMETER TargetDir
The full path to a folder that contains the lock file.
.INPUTS
None.
.OUTPUTS
None.
#>
Param (
[string] $TargetDir
)

$flagFile = Join-Path $TargetDir $FLAG_FILE;
$logFile = $flagFile.Replace('.lock', '.log');

$hasFlagFile = Test-Path $flagFile
if ($hasFlagFile) {
Remove-Item -Path $flagFile -Force
"[$(Get-Now)] Flag file removed" >> $logFile;
}

"[$(Get-Now)] Stopped`r`n`r`n" >> $logFile;
}

Export-ModuleMember -Function Start-MemDumpTimer
Export-ModuleMember -Function Stop-MemDumpTimer
22 changes: 13 additions & 9 deletions eng/Screenshots.win.psm1
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ Add-Type -AssemblyName 'System.Windows.Forms'

. $PSScriptRoot\common\pipeline-logging-functions.ps1

function Get-Now {
return (Get-Date).ToString("yyyy-MM-yy HH:mm:ss")
}

function Save-Screenshot() {
Param (
[string] $TargetFileName,
Expand Down Expand Up @@ -93,7 +97,8 @@ None.
#>
Param (
[string] $TargetDir
[string] $TargetDir,
[int] $WaitSeconds
)

$flagFile = Join-Path $TargetDir $FLAG_FILE;
Expand All @@ -103,7 +108,7 @@ None.
$hasFlagFile = Test-Path $flagFile
if ($hasFlagFile) {
Write-PipelineTaskError -Message "Screenshots are already being taken!" -Type 'warning'
"[START] Screenshots are already being taken!" >> $logFile;
"[$(Get-Now)] Screenshots are already being taken!" >> $logFile;
return;
}

Expand All @@ -114,13 +119,14 @@ None.
$hasFlagFile = Test-Path $flagFile
if (!$hasFlagFile) {
Write-PipelineTaskError -Message "Screenshots no longer being taken" -Type 'warning'
"[START] Screenshots no longer being taken" >> $logFile;
"[$(Get-Now)] Screenshots no longer being taken" >> $logFile;
return;
}

Start-Sleep -Seconds $WaitSeconds

Capture-Screenshot -TargetDir $TargetDir -LogFile $logFile;
"[START] Screenshot taken" >> $logFile;
Start-Sleep -Seconds 180
"[$(Get-Now)] Screenshot taken" >> $logFile;

$hasFlagFile = Test-Path $flagFile
}
Expand Down Expand Up @@ -156,16 +162,14 @@ None.

$flagFile = Join-Path $TargetDir $FLAG_FILE;
$logFile = $flagFile.Replace('.lock', '.log');
"[STOP] Flag: $flagFile" >> $logFile;

$hasFlagFile = Test-Path $flagFile

if ($hasFlagFile) {
Remove-Item -Path $flagFile -Force
"[STOP] Flag file removed" >> $logFile;
"[$(Get-Now)] Flag file removed" >> $logFile;
}

"[STOP] Stopped" >> $logFile;
"[$(Get-Now)] Stopped" >> $logFile;
}

Export-ModuleMember -Function Start-CaptureScreenshots
Expand Down
2 changes: 1 addition & 1 deletion eng/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ stages:
-configuration $(_BuildConfig)
$(_OfficialBuildIdArgs)
/p:Coverage=$(_Coverage)
/bl:$(BUILD.SOURCESDIRECTORY)\artifacts\log\$(_BuildConfig)\BuildSrc.binlog
/bl:$(BUILD.SOURCESDIRECTORY)\artifacts\log\$(_BuildConfig)\BuildSrc.binlog
displayName: Build

# Run Unit Tests
Expand Down
61 changes: 49 additions & 12 deletions eng/custom-cibuild.ps1
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
[CmdletBinding(PositionalBinding=$false)]
Param(
[string][Alias('c')]$configuration = "Debug",
[Parameter(ValueFromRemainingArguments=$true)][String[]]$properties
)

# How long to wait before we consider a build/test run to be unresponsive
$WaitSeconds = 900 # 15 min

. $PSScriptRoot\common\tools.ps1

# CI mode - set to true on CI server for PR validation build or official build.
Expand All @@ -13,39 +17,72 @@ Param(
[bool]$integrationTest = $properties.Contains('-integrationTest')
[bool]$performanceTest = $properties.Contains('-performanceTest')

$moduleLocation = Resolve-Path "$PSScriptRoot\Screenshots.win.psm1"
$initScreenshotsModule = [scriptblock]::Create("Import-Module $moduleLocation")
$screenshotsModuleLocation = Resolve-Path "$PSScriptRoot\Screenshots.win.psm1"
$initScreenshotsModule = [scriptblock]::Create("Import-Module $screenshotsModuleLocation")
$memDumpModuleLocation = Resolve-Path "$PSScriptRoot\MemDump.win.psm1"
$initMemDumpModule = [scriptblock]::Create("Import-Module $memDumpModuleLocation")

$TakeScreenshots = $ci -and ($test -or $integrationTest -or $performanceTest);
$CollectDebugInfo = $ci -and ($test -or $integrationTest -or $performanceTest);
$ImageLogs = '';
if ($TakeScreenshots) {
if ($CollectDebugInfo) {

dotnet tool install dotnet-dump --tool-path .\.tools

# Collect screenshots
# -----------------------------------------------------------
$ImageLogs = Join-Path $LogDir 'screenshots'
Create-Directory $ImageLogs

[ScriptBlock] $ScreenshotCaptureScript = {
param($ImageLogs)
Start-CaptureScreenshots "$ImageLogs"
param($ImageLogs, $WaitSeconds)
Start-CaptureScreenshots -TargetDir $ImageLogs -WaitSeconds $WaitSeconds
};

$job = Start-Job -InitializationScript $initScreenshotsModule `
-ScriptBlock $ScreenshotCaptureScript `
-ArgumentList $ImageLogs
-ArgumentList @( $ImageLogs, $WaitSeconds );

# Collect memory dump
# -----------------------------------------------------------
[ScriptBlock] $MemDumpScript = {
param($WorkingDir, $LogDir, $WaitSeconds)
Start-MemDumpTimer -WorkingDir $WorkingDir -TargetDir $LogDir -WaitSeconds $WaitSeconds
};
$job = Start-Job -InitializationScript $initMemDumpModule `
-ScriptBlock $MemDumpScript `
-ArgumentList @( $(Get-Location), $LogDir, $WaitSeconds )

}

try {

# Run the build script that does the actual work
powershell.exe -File $PSScriptRoot\common\Build.ps1 @properties
# -----------------------------------------------------------
powershell.exe -File $PSScriptRoot\common\Build.ps1 -c $configuration @properties
$exitCode = $LASTEXITCODE
Exit $exitCode;

}
finally {
if ($TakeScreenshots) {
if ($CollectDebugInfo) {

# Stop collecting screenshots
# -----------------------------------------------------------
[ScriptBlock] $ScreenshotCaptureScript = {
param($ImageLogs)
Stop-CaptureScreenshots -TargetDir $ImageLogs
};
Start-Job -InitializationScript $initScreenshotsModule `
-ScriptBlock $ScreenshotCaptureScript `
-ArgumentList $ImageLogs | Receive-Job -AutoRemoveJob -Wait
}
-ArgumentList $ImageLogs | Receive-Job -AutoRemoveJob -Wait;

# Stop collect memory dumps
# -----------------------------------------------------------
[ScriptBlock] $MemDumpScript = {
param($LogDir)
Stop-MemDumpTimer -TargetDir $LogDir
};
Start-Job -InitializationScript $initMemDumpModule `
-ScriptBlock $MemDumpScript `
-ArgumentList $LogDir | Receive-Job -AutoRemoveJob -Wait
}
}

0 comments on commit f88abd1

Please sign in to comment.