Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PPU LLVM: Fix memory leaks and protect against the rise of CPU threads in the coming years #15377

Merged
merged 4 commits into from
Mar 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 110 additions & 18 deletions rpcs3/Emu/Cell/PPUThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ bool serialize<ppu_thread::cr_bits>(utils::serial& ar, typename ppu_thread::cr_b
}

extern void ppu_initialize();
extern void ppu_finalize(const ppu_module& info);
extern void ppu_finalize(const ppu_module& info, bool force_mem_release = false);
extern bool ppu_initialize(const ppu_module& info, bool check_only = false, u64 file_size = 0);
static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module& whole_module);
extern bool ppu_load_exec(const ppu_exec_object&, bool virtual_load, const std::string&, utils::serial* = nullptr);
Expand Down Expand Up @@ -3594,6 +3594,15 @@ namespace
if (found == bucket.map.end()) [[unlikely]]
{
ppu_log.error("Failed to remove module %s", name);

for (auto& buck : buckets)
{
for (auto& mod : buck.map)
{
ppu_log.notice("But there is module %s", mod.first);
}
}

return;
}

Expand Down Expand Up @@ -3706,26 +3715,41 @@ extern fs::file make_file_view(fs::file&& _file, u64 offset, u64 max_size = umax
return file;
}

extern void ppu_finalize(const ppu_module& info)
extern void ppu_finalize(const ppu_module& info, bool force_mem_release)
{
if (info.name.empty())
if (!force_mem_release && info.name.empty())
{
// Don't remove main module from memory
return;
}

const std::string dev_flash = vfs::get("/dev_flash/sys/");
if (!force_mem_release && Emu.GetCat() == "1P")
{
return;
}

const bool may_be_elf = fmt::to_lower(info.path.substr(std::max<usz>(info.path.size(), 3) - 3)) != "prx";

if (info.path.starts_with(dev_flash) || Emu.GetCat() == "1P")
const std::string dev_flash = vfs::get("/dev_flash/");

if (!may_be_elf)
{
if (!force_mem_release && info.path.starts_with(dev_flash + "sys/external/"))
{
// Don't remove dev_flash prx from memory
return;
}
}

if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm)
{
// Don't remove dev_flash prx from memory
return;
}

// Get cache path for this executable
std::string cache_path = fs::get_cache_dir() + "cache/";

if (!Emu.GetTitleID().empty())
if (!info.path.starts_with(dev_flash) && !Emu.GetTitleID().empty() && Emu.GetCat() != "1P")
{
cache_path += Emu.GetTitleID();
cache_path += '/';
Expand Down Expand Up @@ -3958,16 +3982,26 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_

lf_queue<file_info> possible_exec_file_paths;

::semaphore<2> ovl_sema;
// Allow LLVM to allocate up to 2000 times the size of each file
// This works very nicely with Metal Gear Solid 4 for example:
// 2 7MB overlay files -> 14GB
// The growth in memory requirements of LLVM is not linear with file size of course
// But these estimates should hopefully protect RPCS3 in the coming years
// Especially when thread count is on the rise with each CPU generation
atomic_t<u32> file_size_limit = static_cast<u32>(std::clamp<u64>(utils::aligned_div<u64>(utils::get_total_memory(), 2000), 65536, u32{umax}));

const u32 software_thread_limit = std::min<u32>(g_cfg.core.llvm_threads ? g_cfg.core.llvm_threads : u32{umax}, ::size32(file_queue));
const u32 cpu_thread_limit = utils::get_thread_count() > 8u ? std::max<u32>(utils::get_thread_count(), 2) - 1 : utils::get_thread_count(); // One LLVM thread less

named_thread_group workers("SPRX Worker ", std::min<u32>(utils::get_thread_count(), ::size32(file_queue)), [&]
named_thread_group workers("SPRX Worker ", std::min<u32>(software_thread_limit, cpu_thread_limit), [&]
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
// Set low priority
thread_ctrl::scoped_priority low_prio(-1);
u32 inc_fdone = 1;
u32 restore_mem = 0;

for (usz func_i = fnext++; func_i < file_queue.size(); func_i = fnext++, g_progr_fdone += std::exchange(inc_fdone, 1))
{
Expand All @@ -3976,6 +4010,16 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
continue;
}

if (restore_mem)
{
if (!file_size_limit.fetch_add(restore_mem))
{
file_size_limit.notify_all();
}

restore_mem = 0;
}

auto& [path, offset, file_size] = file_queue[func_i];

ppu_log.notice("Trying to load: %s", path);
Expand Down Expand Up @@ -4007,15 +4051,53 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
continue;
}

auto wait_for_memory = [&]() -> bool
{
// Try not to process too many files at once because it seems to reduce performance and cause RAM shortages
// Concurrently compiling more OVL or huge PRX files does not have much theoretical benefit
while (!file_size_limit.fetch_op([&](u32& value)
{
if (value)
{
// Allow at least one file, make 0 the "memory unavailable" sign value for atomic waiting efficiency
const u32 new_val = static_cast<u32>(utils::sub_saturate<u64>(value, file_size));
restore_mem = value - new_val;
value = new_val;
return true;
}

// Resort to waiting
restore_mem = 0;
return false;
}).second)
{
// Wait until not 0
file_size_limit.wait(0);
}

if (Emu.IsStopped())
{
return false;
}

return true;
};

elf_error prx_err{}, ovl_err{};

if (ppu_prx_object obj = src; (prx_err = obj, obj == elf_error::ok))
{
if (!wait_for_memory())
{
// Emulation stopped
continue;
}

if (auto prx = ppu_load_prx(obj, true, path, offset))
{
obj.clear(), src.close(); // Clear decrypted file and elf object memory
ppu_initialize(*prx, false, file_size);
ppu_finalize(*prx);
ppu_finalize(*prx, true);
continue;
}

Expand All @@ -4027,10 +4109,6 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
{
while (ovl_err == elf_error::ok)
{
// Try not to process too many files at once because it seems to reduce performance
// Concurrently compiling more OVL files does not have much theoretical benefit
std::lock_guard lock(ovl_sema);

if (Emu.IsStopped())
{
break;
Expand All @@ -4051,6 +4129,12 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
break;
}

if (!wait_for_memory())
{
// Emulation stopped
break;
}

// Participate in thread execution limitation (takes a long time)
if (std::lock_guard lock(g_fxo->get<jit_core_allocator>().sem); !ovlm->analyse(0, ovlm->entry, ovlm->seg0_code_end, ovlm->applied_patches, []()
{
Expand All @@ -4063,7 +4147,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_

obj.clear(), src.close(); // Clear decrypted file and elf object memory
ppu_initialize(*ovlm, false, file_size);
ppu_finalize(*ovlm);
ppu_finalize(*ovlm, true);
break;
}

Expand All @@ -4073,10 +4157,18 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
}
}

ppu_log.notice("Failed to precompile '%s' (prx: %s, ovl: %s): Attempting tratment as executable file", path, prx_err, ovl_err);
ppu_log.notice("Failed to precompile '%s' (prx: %s, ovl: %s): Attempting compilation as executable file", path, prx_err, ovl_err);
possible_exec_file_paths.push(path, offset, file_size);
inc_fdone = 0;
}

if (restore_mem)
{
if (!file_size_limit.fetch_add(restore_mem))
{
file_size_limit.notify_all();
}
}
});

// Join every thread
Expand Down Expand Up @@ -4164,7 +4256,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
Emu.ConfigurePPUCache(!Emu.IsPathInsideDir(_main.path, g_cfg_vfs.get_dev_flash()));
ppu_initialize(_main, false, file_size);
spu_cache::initialize(false);
ppu_finalize(_main);
ppu_finalize(_main, true);
_main = {};
g_fxo->get<spu_cache>() = std::move(current_cache);
break;
Expand Down Expand Up @@ -4288,7 +4380,7 @@ extern void ppu_initialize()
}
}

const std::string firmware_sprx_path = vfs::get(dev_flash_located ? "/dev_flash/"sv : "/dev_flash/sys/"sv);
const std::string firmware_sprx_path = vfs::get(dev_flash_located ? "/dev_flash/"sv : "/dev_flash/sys/external/"sv);
dir_queue.emplace_back(firmware_sprx_path);
}

Expand Down
25 changes: 22 additions & 3 deletions rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4495,11 +4495,10 @@ struct spu_llvm_worker
void operator()()
{
// SPU LLVM Recompiler instance
const auto compiler = spu_recompiler_base::make_llvm_recompiler();
compiler->init();
std::unique_ptr<spu_recompiler_base> compiler;

// Fake LS
std::vector<be_t<u32>> ls(0x10000);
std::vector<be_t<u32>> ls;

bool set_relax_flag = false;

Expand Down Expand Up @@ -4542,6 +4541,15 @@ struct spu_llvm_worker
break;
}

if (!compiler)
{
// Postponed initialization
compiler = spu_recompiler_base::make_llvm_recompiler();
compiler->init();

ls.resize(SPU_LS_SIZE / sizeof(be_t<u32>));
}

if (!set_relax_flag)
{
spu_thread::g_spu_work_count++;
Expand Down Expand Up @@ -4624,6 +4632,17 @@ struct spu_llvm
return;
}

while (!registered && thread_ctrl::state() != thread_state::aborting)
{
// Wait for the first SPU block before launching any thread
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&registered)[1], 0);
}

if (thread_ctrl::state() == thread_state::aborting)
{
return;
}

// To compile (hash -> item)
std::unordered_multimap<u64, spu_item*, value_hash<u64>> enqueued;

Expand Down
2 changes: 1 addition & 1 deletion rpcs3/Emu/Cell/lv2/sys_overlay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
extern std::pair<std::shared_ptr<lv2_overlay>, CellError> ppu_load_overlay(const ppu_exec_object&, bool virtual_load, const std::string& path, s64 file_offset, utils::serial* ar = nullptr);

extern bool ppu_initialize(const ppu_module&, bool check_only = false, u64 file_size = 0);
extern void ppu_finalize(const ppu_module&);
extern void ppu_finalize(const ppu_module& info, bool force_mem_release = false);

LOG_CHANNEL(sys_overlay);

Expand Down
2 changes: 1 addition & 1 deletion rpcs3/Emu/Cell/lv2/sys_prx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
extern std::shared_ptr<lv2_prx> ppu_load_prx(const ppu_prx_object&, bool virtual_load, const std::string&, s64, utils::serial* = nullptr);
extern void ppu_unload_prx(const lv2_prx& prx);
extern bool ppu_initialize(const ppu_module&, bool check_only = false, u64 file_size = 0);
extern void ppu_finalize(const ppu_module&);
extern void ppu_finalize(const ppu_module& info, bool force_mem_release = false);
extern void ppu_manual_load_imports_exports(u32 imports_start, u32 imports_size, u32 exports_start, u32 exports_size, std::basic_string<bool>& loaded_flags);

LOG_CHANNEL(sys_prx);
Expand Down