Make jl_cumulative_compile_time_ns global (and reentrant).
Now, multiple tasks (on the same or different threads) can start and stop compilation
time measurement without interrupting each other.

Add `cumulative_compile_time_ns()` to return the process-wide cumulative compile time without stopping measurement.

```julia
julia> Base.cumulative_compile_time_ns_before()  # enable constantly measuring compilation time
0x0000000000000000

julia> @time @eval module M ; f(x) = 2+x; end
  0.006730 seconds (919 allocations: 55.212 KiB, 57.20% compilation time)
Main.M

julia> Base.cumulative_compile_time_ns()
0x00000000075246b3

julia> @time 2+2
  0.000000 seconds
4

julia> Base.cumulative_compile_time_ns()
0x0000000007fe4a46

julia> @time @eval M.f(2)
  0.003538 seconds (750 allocations: 46.247 KiB, 94.64% compilation time)
4

julia> Base.cumulative_compile_time_ns()
0x000000000831619e
```
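
Because `cumulative_compile_time_ns_before()` now increments a shared counter and `cumulative_compile_time_ns_after()` decrements it (rather than setting a per-thread flag), nested and concurrent callers compose. A minimal sketch of the intended semantics; the flag values in the comments are illustrative, not output:

```julia
# Reentrancy sketch: each `_before()` bumps the shared measurement flag and
# each `_after()` drops it, so measurement stays enabled until the outermost
# caller finishes.
Base.cumulative_compile_time_ns_before()  # flag 0 -> 1: measurement on
Base.cumulative_compile_time_ns_before()  # flag 1 -> 2: e.g. a nested `@time`
Base.cumulative_compile_time_ns_after()   # flag 2 -> 1: still measuring
Base.cumulative_compile_time_ns_after()   # flag 1 -> 0: measurement off
```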

Make `jl_cumulative_compile_time` into a global, atomic variable.

Instead of keeping per-thread compilation timers, this change keeps a single
global counter of compilation time, updated with atomic operations.
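
Every compilation entry point (codegen, JIT, and type inference) follows the same guarded pattern; a condensed sketch in the runtime's C, with the locking and the actual compilation work elided:

```c
// Condensed sketch of the pattern repeated at each entry point in the
// diffs below (locking and the real compilation work are elided).
uint64_t compiler_start_time = 0;
if (jl_atomic_load(&jl_measure_compile_time))   // is any caller measuring?
    compiler_start_time = jl_hrtime();
// ... perform compilation ...
if (jl_atomic_load(&jl_measure_compile_time))
    jl_atomic_fetch_add(&jl_cumulative_compile_time,
                        jl_hrtime() - compiler_start_time);
```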

Fixes #41739

```julia
julia> include("./compilation-task-migration-17-example.jl")
start thread: 2
end thread: 2
  5.185706 seconds (3.53 M allocations: 2.570 GiB, 7.34% gc time, 15.57% compilation time)

julia> include("./compilation-task-migration-17-example.jl")
start thread: 3
WARNING: replacing module M.
end thread: 1
  4.110316 seconds (18.23 k allocations: 2.391 GiB, 5.67% gc time, 0.24% compilation time)
```
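
The script itself is not included in this commit; a hypothetical reconstruction of such a repro (module and function names are illustrative), run with multiple threads (e.g. `julia -t4`):

```julia
# compilation-task-migration-17-example.jl (hypothetical reconstruction):
# time a task that compiles fresh code and may migrate between threads at a
# yield point; per-thread counters would misattribute its compile time.
using Base.Threads: @spawn, threadid

t = @spawn begin
    println("start thread: ", threadid())
    @time begin
        @eval module M; f(x) = x + 1; end    # fresh code forces compilation
        sleep(1)                             # yield: the task may migrate
        @eval M.f(2)
        println("end thread: ", threadid())  # may differ from the start thread
    end
end
wait(t)
```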
NHDaly committed Aug 1, 2021
1 parent bdacfa2 commit 5567f5f
Showing 8 changed files with 58 additions and 42 deletions.
4 changes: 3 additions & 1 deletion base/timing.jl
```diff
@@ -55,9 +55,11 @@ function gc_alloc_count(diff::GC_Diff)
     diff.malloc + diff.realloc + diff.poolalloc + diff.bigalloc
 end
 
-# cumulative total time spent on compilation
+# cumulative total time spent on compilation, in nanoseconds
 cumulative_compile_time_ns_before() = ccall(:jl_cumulative_compile_time_ns_before, UInt64, ())
 cumulative_compile_time_ns_after() = ccall(:jl_cumulative_compile_time_ns_after, UInt64, ())
+# cumulative total time the process has spent on compilation while measurement was enabled.
+cumulative_compile_time_ns() = ccall(:jl_cumulative_compile_time_ns, UInt64, ())
 
 # total time spend in garbage collection, in nanoseconds
 gc_time_ns() = ccall(:jl_gc_total_hrtime, UInt64, ())
```
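
These are the internal hooks that `@time` uses. A rough usage sketch of measuring the compilation share of a region with them (unexported, internal API; the function name is illustrative):

```julia
# Rough usage sketch of the internal hooks above, mirroring what `@time` does.
compile_before = Base.cumulative_compile_time_ns_before()
@eval f_fresh(x) = x^2 + 1    # a fresh method triggers compilation
@eval f_fresh(3)
compile_ns = Base.cumulative_compile_time_ns_after() - compile_before
println("compilation: ", compile_ns / 1e9, " seconds")
```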
14 changes: 6 additions & 8 deletions src/aotcompile.cpp
```diff
@@ -286,8 +286,7 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
     JL_GC_PUSH1(&src);
     JL_LOCK(&codegen_lock);
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    if (jl_atomic_load(&jl_measure_compile_time))
         compiler_start_time = jl_hrtime();
 
     CompilationPolicy policy = (CompilationPolicy) _policy;
@@ -415,8 +414,8 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
     }
 
     data->M = std::move(clone);
-    if (jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (jl_atomic_load(&jl_measure_compile_time))
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     if (policy == CompilationPolicy::ImagingMode)
         imaging_mode = 0;
     JL_UNLOCK(&codegen_lock); // Might GC
@@ -916,8 +915,7 @@ void *jl_get_llvmf_defn(jl_method_instance_t *mi, size_t world, char getwrapper,
     jl_llvm_functions_t decls;
     JL_LOCK(&codegen_lock);
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    if (jl_atomic_load(&jl_measure_compile_time))
         compiler_start_time = jl_hrtime();
     std::tie(m, decls) = jl_emit_code(mi, src, jlrettype, output);
 
@@ -942,8 +940,8 @@ void *jl_get_llvmf_defn(jl_method_instance_t *mi, size_t world, char getwrapper,
         m.release(); // the return object `llvmf` will be the owning pointer
     }
     JL_GC_POP();
-    if (jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (jl_atomic_load(&jl_measure_compile_time))
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     JL_UNLOCK(&codegen_lock); // Might GC
     if (F)
         return F;
```
7 changes: 3 additions & 4 deletions src/gf.c
```diff
@@ -3164,15 +3164,14 @@ static uint64_t inference_start_time = 0;
 JL_DLLEXPORT void jl_typeinf_begin(void)
 {
     JL_LOCK(&typeinf_lock);
-    if (jl_measure_compile_time[jl_threadid()])
+    if (jl_atomic_load(&jl_measure_compile_time))
         inference_start_time = jl_hrtime();
 }
 
 JL_DLLEXPORT void jl_typeinf_end(void)
 {
-    int tid = jl_threadid();
-    if (typeinf_lock.count == 1 && jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - inference_start_time);
+    if (typeinf_lock.count == 1 && jl_atomic_load(&jl_measure_compile_time))
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - inference_start_time));
     JL_UNLOCK(&typeinf_lock);
 }
 
```
45 changes: 23 additions & 22 deletions src/jitlayers.cpp
```diff
@@ -78,16 +78,21 @@ void jl_jit_globals(std::map<void *, GlobalVariable*> &globals)
 extern "C" JL_DLLEXPORT
 uint64_t jl_cumulative_compile_time_ns_before()
 {
-    int tid = jl_threadid();
-    jl_measure_compile_time[tid] = 1;
-    return jl_cumulative_compile_time[tid];
+    // Increment the flag to allow reentrant callers to `@time`.
+    jl_atomic_fetch_add(&jl_measure_compile_time, 1);
+    return jl_atomic_load(&jl_cumulative_compile_time);
 }
 extern "C" JL_DLLEXPORT
 uint64_t jl_cumulative_compile_time_ns_after()
 {
-    int tid = jl_threadid();
-    jl_measure_compile_time[tid] = 0;
-    return jl_cumulative_compile_time[tid];
+    // Decrement the flag when done measuring, allowing other callers to continue measuring.
+    jl_atomic_fetch_add(&jl_measure_compile_time, -1);
+    return jl_atomic_load(&jl_cumulative_compile_time);
 }
+
+extern "C" JL_DLLEXPORT
+uint64_t jl_cumulative_compile_time_ns() {
+    return jl_atomic_load(&jl_cumulative_compile_time);
+}
 
 // this generates llvm code for the lambda info
@@ -233,8 +238,7 @@ int jl_compile_extern_c(void *llvmmod, void *p, void *sysimg, jl_value_t *declrt
 {
     JL_LOCK(&codegen_lock);
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    if (jl_atomic_load(&jl_measure_compile_time))
         compiler_start_time = jl_hrtime();
     jl_codegen_params_t params;
     jl_codegen_params_t *pparams = (jl_codegen_params_t*)p;
@@ -258,8 +262,8 @@ int jl_compile_extern_c(void *llvmmod, void *p, void *sysimg, jl_value_t *declrt
         if (success && llvmmod == NULL)
             jl_add_to_ee(std::unique_ptr<Module>(into));
     }
-    if (codegen_lock.count == 1 && jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (codegen_lock.count == 1 && jl_atomic_load(&jl_measure_compile_time))
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     JL_UNLOCK(&codegen_lock);
     return success;
 }
@@ -315,8 +319,7 @@ jl_code_instance_t *jl_generate_fptr(jl_method_instance_t *mi JL_PROPAGATES_ROOT
 {
     JL_LOCK(&codegen_lock); // also disables finalizers, to prevent any unexpected recursion
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    if (jl_atomic_load(&jl_measure_compile_time))
         compiler_start_time = jl_hrtime();
     // if we don't have any decls already, try to generate it now
     jl_code_info_t *src = NULL;
@@ -354,8 +357,8 @@ jl_code_instance_t *jl_generate_fptr(jl_method_instance_t *mi JL_PROPAGATES_ROOT
     else {
         codeinst = NULL;
     }
-    if (codegen_lock.count == 1 && jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (codegen_lock.count == 1 && jl_atomic_load(&jl_measure_compile_time))
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     JL_UNLOCK(&codegen_lock);
     JL_GC_POP();
     return codeinst;
@@ -369,8 +372,7 @@ void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec)
     }
     JL_LOCK(&codegen_lock);
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    if (jl_atomic_load(&jl_measure_compile_time))
         compiler_start_time = jl_hrtime();
     if (unspec->invoke == NULL) {
         jl_code_info_t *src = NULL;
@@ -398,8 +400,8 @@ void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec)
         }
         JL_GC_POP();
     }
-    if (codegen_lock.count == 1 && jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (codegen_lock.count == 1 && jl_atomic_load(&jl_measure_compile_time))
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     JL_UNLOCK(&codegen_lock); // Might GC
 }
 
@@ -422,8 +424,7 @@ jl_value_t *jl_dump_method_asm(jl_method_instance_t *mi, size_t world,
         // so create an exception here so we can print pretty our lies
         JL_LOCK(&codegen_lock); // also disables finalizers, to prevent any unexpected recursion
         uint64_t compiler_start_time = 0;
-        int tid = jl_threadid();
-        if (jl_measure_compile_time[tid])
+        if (jl_atomic_load(&jl_measure_compile_time))
             compiler_start_time = jl_hrtime();
         specfptr = (uintptr_t)codeinst->specptr.fptr;
         if (specfptr == 0) {
@@ -448,8 +449,8 @@ jl_value_t *jl_dump_method_asm(jl_method_instance_t *mi, size_t world,
             }
             JL_GC_POP();
         }
-        if (jl_measure_compile_time[tid])
-            jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+        if (jl_atomic_load(&jl_measure_compile_time))
+            jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
         JL_UNLOCK(&codegen_lock);
     }
     if (specfptr != 0)
```
5 changes: 3 additions & 2 deletions src/julia_internal.h
```diff
@@ -154,8 +154,9 @@ static inline uint64_t cycleclock(void)
 
 #include "timing.h"
 
-extern uint8_t *jl_measure_compile_time;
-extern uint64_t *jl_cumulative_compile_time;
+// Global *atomic* integers controlling *process-wide* measurement of compilation time.
+extern volatile uint8_t jl_measure_compile_time;
+extern volatile uint64_t jl_cumulative_compile_time;
 
 #ifdef _COMPILER_MICROSOFT_
 # define jl_return_address() ((uintptr_t)_ReturnAddress())
```
2 changes: 1 addition & 1 deletion src/task.c
```diff
@@ -561,7 +561,7 @@ static void JL_NORETURN throw_internal(jl_task_t *ct, jl_value_t *exception JL_M
     ptls->io_wait = 0;
     // @time needs its compile timer disabled on error,
     // and cannot use a try-finally as it would break scope for assignments
-    jl_measure_compile_time[ptls->tid] = 0;
+    jl_atomic_fetch_add(&jl_measure_compile_time, -1);
     JL_GC_PUSH1(&exception);
     jl_gc_unsafe_enter(ptls);
     if (exception) {
```
6 changes: 2 additions & 4 deletions src/threading.c
```diff
@@ -287,8 +287,8 @@ void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t *k)
 #endif
 
 jl_ptls_t *jl_all_tls_states JL_GLOBALLY_ROOTED;
-uint8_t *jl_measure_compile_time = NULL;
-uint64_t *jl_cumulative_compile_time = NULL;
+volatile uint8_t jl_measure_compile_time = 0;
+volatile uint64_t jl_cumulative_compile_time = 0;
 
 // return calling thread's ID
 // Also update the suspended_threads list in signals-mach when changing the
@@ -467,8 +467,6 @@ void jl_init_threading(void)
     }
     if (jl_n_threads <= 0)
         jl_n_threads = 1;
-    jl_measure_compile_time = (uint8_t*)calloc(jl_n_threads, sizeof(*jl_measure_compile_time));
-    jl_cumulative_compile_time = (uint64_t*)calloc(jl_n_threads, sizeof(*jl_cumulative_compile_time));
 #ifndef __clang_analyzer__
     jl_all_tls_states = (jl_ptls_t*)calloc(jl_n_threads, sizeof(void*));
 #endif
```
17 changes: 17 additions & 0 deletions test/misc.jl
```diff
@@ -263,6 +263,23 @@ function timev_macro_scope()
 end
 @test timev_macro_scope() == 1
 
+before = Base.cumulative_compile_time_ns_before();
+
+# exercise concurrent calls to `@time` for reentrant compilation time measurement.
+t1 = @async @time begin
+    sleep(2)
+    @eval module M ; f(x,y) = x+y ; end
+    @eval M.f(2,3)
+end
+t2 = @async begin
+    sleep(1)
+    @time 2 + 2
+end
+
+after = Base.cumulative_compile_time_ns_after();
+@test after >= before;
+@test Base.cumulative_compile_time_ns() >= after;
+
 # interactive utilities
 
 struct ambigconvert; end # inject a problematic `convert` method to ensure it still works
```
