diff --git a/examples/llama.vim b/examples/llama.vim new file mode 100644 index 0000000000000..e75872cae0e9c --- /dev/null +++ b/examples/llama.vim @@ -0,0 +1,706 @@ +" LLM-based text completion using llama.cpp +" +" requires: +" +" - neovim +" - curl +" - llama.cpp server instance +" - FIM-compatible model +" +" sample config: +" +" - Tab - accept the current suggestion +" - Shift+Tab - accept just the first line of the segguestion +" - Ctrl+F - toggle FIM completion manually +" +" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim +" +" start the llama.cpp server with a FIM-compatible model. for example: +" +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256 +" +" --batch-size [512, model max context] +" +" adjust the batch size to control how much of the provided local context will be used during the inference +" lower values will use smaller part of the context around the cursor, which will result in faster processing +" +" --ubatch-size [64, 2048] +" +" chunks the batch into smaller chunks for faster processing +" depends on the specific hardware. use llama-bench to profile and determine the best size +" +" --cache-reuse (ge:llama_config.n_predict, 1024] +" +" this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict +" using non-zero value enables context reuse on the server side which dramatically improves the performance at +" large contexts. a value of 256 should be good for all cases +" +" run this once to initialise llama.vim: +" +" :call llama#init() +" +" more info: https://github.com/ggerganov/llama.cpp/pull/9787 +" + +" colors (adjust to your liking) +highlight llama_hl_hint guifg=#ff772f +highlight llama_hl_info guifg=#77ff2f + +" general parameters: +" +" endpoint: llama.cpp server endpoint +" n_prefix: number of lines before the cursor location to include in the local prefix +" n_suffix: number of lines after the cursor location to include in the local suffix +" n_predict: max number of tokens to predict +" t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported) +" t_max_predict_ms: max alloted time for the prediction +" show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) +" auto_fim: trigger FIM completion automatically on cursor movement +" max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor +" +" ring buffer of chunks, accumulated with time upon: +" +" - completion request +" - yank +" - entering a buffer +" - leaving a buffer +" - writing a file +" +" parameters for the ring-buffer with extra context: +" +" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable) +" ring_chunk_size: max size of the chunks (in number of lines) +" note: adjust these numbers so that you don't overrun your context +" at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context +" ring_scope: the range around the cursor position (in number of lines) for gathering chunks after FIM +" ring_update_ms: how often to process queued chunks in normal mode +" +let s:default_config = { + \ 'endpoint': 'http://127.0.0.1:8012/infill', + \ 'n_prefix': 256, + \ 'n_suffix': 64, + \ 'n_predict': 128, + \ 't_max_prompt_ms': 500, + \ 't_max_predict_ms': 1000, + \ 'show_info': 2, + \ 'auto_fim': v:true, + \ 'max_line_suffix': 8, + \ 'ring_n_chunks': 64, + \ 'ring_chunk_size': 64, + \ 'ring_scope': 1024, + \ 'ring_update_ms': 1000, + \ } + +let g:llama_config = get(g:, 'llama_config', s:default_config) + +function! s:rand(i0, i1) abort + return a:i0 + rand() % (a:i1 - a:i0 + 1) +endfunction + +function! llama#init() + if !executable('curl') + echohl WarningMsg + echo 'llama.vim requires the "curl" command to be available' + echohl None + return + endif + + let s:pos_x = 0 " cursor position upon start of completion + let s:pos_y = 0 + + let s:line_cur = '' + + let s:line_cur_prefix = '' + let s:line_cur_suffix = '' + + let s:ring_chunks = [] " current set of chunks used as extra context + let s:ring_queued = [] " chunks that are queued to be sent for processing + let s:ring_n_evict = 0 + + let s:hint_shown = v:false + let s:pos_y_pick = -9999 " last y where we picked a chunk + let s:pos_dx = 0 + let s:content = [] + let s:can_accept = v:false + + let s:timer_fim = -1 + let s:t_fim_start = reltime() " used to measure total FIM time + let s:t_last_move = reltime() " last time the cursor moved + + let s:current_job = v:null + + augroup llama + autocmd! + autocmd InsertEnter * inoremap llama#fim_inline(v:false) + autocmd InsertLeavePre * call llama#fim_cancel() + + autocmd CursorMoved * call s:on_move() + autocmd CursorMovedI * call s:on_move() + autocmd CompleteChanged * call llama#fim_cancel() + + if g:llama_config.auto_fim + autocmd CursorMovedI * call llama#fim(v:true) + endif + + " gather chunks upon yanking + autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif + + " gather chunks upon entering/leaving a buffer + autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) + autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) + + " gather chunk upon saving the file + autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) + augroup END + + silent! call llama#fim_cancel() + + " init background update of the ring buffer + if g:llama_config.ring_n_chunks > 0 + call s:ring_update() + endif +endfunction + +" compute how similar two chunks of text are +" 0 - no similarity, 1 - high similarity +" TODO: figure out something better +function! s:chunk_sim(c0, c1) + let l:lines0 = len(a:c0) + let l:lines1 = len(a:c1) + + let l:common = 0 + + for l:line0 in a:c0 + for l:line1 in a:c1 + if l:line0 == l:line1 + let l:common += 1 + break + endif + endfor + endfor + + return 2.0 * l:common / (l:lines0 + l:lines1) +endfunction + +" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing +" +" no_mod - do not pick chunks from buffers with pending changes +" do_evict - evict chunks that are very similar to the new one +" +function! s:pick_chunk(text, no_mod, do_evict) + " do not pick chunks from buffers with pending changes or buffers that are not files + if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) + return + endif + + " if the extra context option is disabled - do nothing + if g:llama_config.ring_n_chunks <= 0 + return + endif + + " don't pick very small chunks + if len(a:text) < 3 + return + endif + + if len(a:text) + 1 < g:llama_config.ring_chunk_size + let l:chunk = a:text + else + let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2])) + let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)]) + + let l:chunk = a:text[l:l0:l:l1] + endif + + let l:chunk_str = join(l:chunk, "\n") . "\n" + + " check if this chunk is already added + let l:exist = v:false + + for i in range(len(s:ring_chunks)) + if s:ring_chunks[i].data == l:chunk + let l:exist = v:true + break + endif + endfor + + for i in range(len(s:ring_queued)) + if s:ring_queued[i].data == l:chunk + let l:exist = v:true + break + endif + endfor + + if l:exist + return + endif + + " evict queued chunks that are very similar to the new one + for i in range(len(s:ring_queued) - 1, 0, -1) + if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9 + if a:do_evict + call remove(s:ring_queued, i) + let s:ring_n_evict += 1 + else + return + endif + endif + endfor + + " also from s:ring_chunks + for i in range(len(s:ring_chunks) - 1, 0, -1) + if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 + if a:do_evict + call remove(s:ring_chunks, i) + let s:ring_n_evict += 1 + else + return + endif + endif + endfor + + " TODO: become parameter ? + if len(s:ring_queued) == 16 + call remove(s:ring_queued, 0) + endif + + call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')}) + + "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) +endfunction + +" picks a queued chunk, sends it for processing and adds it to s:ring_chunks +" called every g:llama_config.ring_update_ms +function! s:ring_update() + call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()}) + + " update only if in normal mode or if the cursor hasn't moved for a while + if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0 + return + endif + + if len(s:ring_queued) == 0 + return + endif + + " move the first queued chunk to the ring buffer + if len(s:ring_chunks) == g:llama_config.ring_n_chunks + call remove(s:ring_chunks, 0) + endif + + call add(s:ring_chunks, remove(s:ring_queued, 0)) + + "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) + + " send asynchronous job with the new extra context so that it is ready for the next FIM + let l:extra_context = [] + for l:chunk in s:ring_chunks + call add(l:extra_context, { + \ 'text': l:chunk.str, + \ 'time': l:chunk.time, + \ 'filename': l:chunk.filename + \ }) + endfor + + " no samplers needed here + let l:request = json_encode({ + \ 'input_prefix': "", + \ 'input_suffix': "", + \ 'input_extra': l:extra_context, + \ 'prompt': "", + \ 'n_predict': 1, + \ 'temperature': 0.0, + \ 'stream': v:false, + \ 'samplers': ["temperature"], + \ 'cache_prompt': v:true, + \ 't_max_prompt_ms': 1, + \ 't_max_predict_ms': 1 + \ }) + + let l:curl_command = printf( + \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s", + \ g:llama_config.endpoint, shellescape(l:request) + \ ) + + " no callbacks because we don't need to process the response + call jobstart(l:curl_command, {}) +endfunction + +" necessary for 'inoremap ' +function! llama#fim_inline(is_auto) abort + call llama#fim(a:is_auto) + return '' +endfunction + +" the main FIM call +" takes local context around the cursor and sends it together with the extra context to the server for completion +function! llama#fim(is_auto) abort + " we already have a suggestion for the current cursor position + if s:hint_shown && !a:is_auto + call llama#fim_cancel() + return + endif + + call llama#fim_cancel() + + " avoid sending repeated requests too fast + if reltimefloat(reltime(s:t_fim_start)) < 0.6 + if s:timer_fim != -1 + call timer_stop(s:timer_fim) + let s:timer_fim = -1 + endif + + let s:t_fim_start = reltime() + let s:timer_fim = timer_start(600, {-> llama#fim(v:true)}) + return + endif + + let s:t_fim_start = reltime() + + let s:content = [] + let s:can_accept = v:false + + let s:pos_x = col('.') - 1 + let s:pos_y = line('.') + let l:max_y = line('$') + + let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1) + let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix])) + + let s:line_cur = getline('.') + + let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x) + let s:line_cur_suffix = strpart(s:line_cur, s:pos_x) + + if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix + return + endif + + let l:prefix = "" + \ . join(l:lines_prefix, "\n") + \ . "\n" + + let l:prompt = "" + \ . s:line_cur_prefix + + let l:suffix = "" + \ . s:line_cur_suffix + \ . "\n" + \ . join(l:lines_suffix, "\n") + \ . "\n" + + " prepare the extra context data + let l:extra_context = [] + for l:chunk in s:ring_chunks + call add(l:extra_context, { + \ 'text': l:chunk.str, + \ 'time': l:chunk.time, + \ 'filename': l:chunk.filename + \ }) + endfor + + " the indentation of the current line + let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) + + let l:request = json_encode({ + \ 'input_prefix': l:prefix, + \ 'input_suffix': l:suffix, + \ 'input_extra': l:extra_context, + \ 'prompt': l:prompt, + \ 'n_predict': g:llama_config.n_predict, + \ 'n_indent': l:indent, + \ 'top_k': 40, + \ 'top_p': 0.99, + \ 'stream': v:false, + \ 'samplers': ["top_k", "top_p", "infill"], + \ 'cache_prompt': v:true, + \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, + \ 't_max_predict_ms': g:llama_config.t_max_predict_ms + \ }) + + let l:curl_command = printf( + \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s", + \ g:llama_config.endpoint, shellescape(l:request) + \ ) + + if s:current_job != v:null + call jobstop(s:current_job) + endif + + " send the request asynchronously + let s:current_job = jobstart(l:curl_command, { + \ 'on_stdout': function('s:fim_on_stdout'), + \ 'on_exit': function('s:fim_on_exit'), + \ 'stdout_buffered': v:true, + \ 'pos_x': s:pos_x, + \ 'pos_y': s:pos_y, + \ 'is_auto': a:is_auto + \ }) + + " TODO: per-file location + let l:delta_y = abs(s:pos_y - s:pos_y_pick) + + " gather some extra context nearby and process it in the background + " only gather chunks if the cursor has moved a lot + " TODO: something more clever? reranking? + if a:is_auto && l:delta_y > 32 + " expand the prefix even further + call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) + + " pick a suffix chunk + call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false) + + let s:pos_y_pick = s:pos_y + endif +endfunction + +" if first_line == v:true accept only the first line of the response +function! llama#fim_accept(first_line) + " insert the suggestion at the cursor location + if s:can_accept && len(s:content) > 0 + call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0]) + if len(s:content) > 1 + if !a:first_line + call append(s:pos_y, s:content[1:-1]) + endif + endif + + " move the cursor to the end of the accepted text + if !a:first_line && len(s:content) > 1 + call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1) + else + call cursor(s:pos_y, s:pos_x + len(s:content[0])) + endif + endif + + call llama#fim_cancel() +endfunction + +function! llama#fim_cancel() + let s:hint_shown = v:false + + " clear the virtual text + let l:bufnr = bufnr('%') + + let l:id_vt_fim = nvim_create_namespace('vt_fim') + let l:id_vt_info = nvim_create_namespace('vt_info') + + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) + + " remove the mappings + silent! iunmap + silent! iunmap + silent! iunmap +endfunction + +function! s:on_move() + let s:t_last_move = reltime() + + call llama#fim_cancel() +endfunction + +" callback that processes the FIM result from the server and displays the suggestion +function! s:fim_on_stdout(job_id, data, event) dict + let l:raw = join(a:data, "\n") + if len(l:raw) == 0 + return + endif + + if self.pos_x != col('.') - 1 || self.pos_y != line('.') + return + endif + + " show the suggestion only in insert mode + if mode() !=# 'i' + return + endif + + let s:pos_x = self.pos_x + let s:pos_y = self.pos_y + + let s:can_accept = v:true + let l:has_info = v:false + + if s:can_accept && v:shell_error + if !self.is_auto + call add(s:content, "<| curl error: is the server on? |>") + endif + let s:can_accept = v:false + endif + + let l:n_prompt = 0 + let l:t_prompt_ms = 1.0 + let l:s_prompt = 0 + + let l:n_predict = 0 + let l:t_predict_ms = 1.0 + let l:s_predict = 0 + + " get the generated suggestion + if s:can_accept + let l:response = json_decode(l:raw) + + for l:part in split(get(l:response, 'content', ''), "\n", 1) + call add(s:content, l:part) + endfor + + " remove trailing new lines + while len(s:content) > 0 && s:content[-1] == "" + call remove(s:content, -1) + endwhile + + let l:generation_settings = get(l:response, 'generation_settings', {}) + let l:n_ctx = get(l:generation_settings, 'n_ctx', 0) + + let l:n_cached = get(l:response, 'tokens_cached', 0) + let l:truncated = get(l:response, 'truncated', v:false) + + " if response.timings is available + if len(get(l:response, 'timings', {})) > 0 + let l:has_info = v:true + let l:timings = get(l:response, 'timings', {}) + + let l:n_prompt = get(l:timings, 'prompt_n', 0) + let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) + let l:s_prompt = get(l:timings, 'prompt_per_second', 0) + + let l:n_predict = get(l:timings, 'predicted_n', 0) + let l:t_predict_ms = get(l:timings, 'predicted_ms', 1) + let l:s_predict = get(l:timings, 'predicted_per_second', 0) + endif + endif + + if len(s:content) == 0 + call add(s:content, "") + let s:can_accept = v:false + endif + + if len(s:content) == 0 + return + endif + + " NOTE: the following is logic for discarding predictions that repeat existing text + " the code is quite ugly and there is very likely a simpler and more canonical way to implement this + " + " still, I wonder if there is some better way that avoids having to do these special hacks? + " on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would + " start generating whatever we have given it via the extra context. but on the other hand, it's not very + " helpful to re-generate the same code that is already there + + " truncate the suggestion if the first line is empty + if len(s:content) == 1 && s:content[0] == "" + let s:content = [""] + endif + + " ... and the next lines are repeated + if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1) + let s:content = [""] + endif + + " truncate the suggestion if it repeats the suffix + if len(s:content) == 1 && s:content[0] == s:line_cur_suffix + let s:content = [""] + endif + + " find the first non-empty line (strip whitespace) + let l:cmp_y = s:pos_y + 1 + while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$' + let l:cmp_y += 1 + endwhile + + if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y) + " truncate the suggestion if it repeats the next line + if len(s:content) == 1 + let s:content = [""] + endif + + " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1 + if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1] + let s:content = [""] + endif + + " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1) + if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n") + let s:content = [""] + endif + endif + + " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix + "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) + "for i in range(1, len(s:content) - 1) + " if strlen(matchstr(s:content[i], '^\s*')) < l:indent + " let s:content = s:content[:i - 1] + " break + " endif + "endfor + + let s:pos_dx = len(s:content[-1]) + + let s:content[-1] .= s:line_cur_suffix + + call llama#fim_cancel() + + " display virtual text with the suggestion + let l:bufnr = bufnr('%') + + let l:id_vt_fim = nvim_create_namespace('vt_fim') + let l:id_vt_info = nvim_create_namespace('vt_info') + + " construct the info message + if g:llama_config.show_info > 0 && l:has_info + " prefix the info string with whitespace in order to offset it to the right of the fim overlay + let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) + + if l:truncated + let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks", + \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', + \ l:n_cached, l:n_ctx + \ ) + else + let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", + \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', + \ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued), + \ l:n_prompt, l:t_prompt_ms, l:s_prompt, + \ l:n_predict, l:t_predict_ms, l:s_predict, + \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) + \ ) + endif + + if g:llama_config.show_info == 1 + "" display it in the statusline + let &statusline = l:info + elseif g:llama_config.show_info == 2 + " display it to the right of the current line + call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { + \ 'virt_text': [[l:info, 'llama_hl_info']], + \ 'virt_text_pos': 'eol', + \ }) + endif + endif + + " display the suggestion + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, { + \ 'virt_text': [[s:content[0], 'llama_hl_hint']], + \ 'virt_text_win_col': virtcol('.') - 1 + \ }) + + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, { + \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), + \ 'virt_text_win_col': virtcol('.') + \ }) + + " setup accept shortcuts + inoremap :call llama#fim_accept(v:false) + inoremap :call llama#fim_accept(v:true) + + let s:hint_shown = v:true +endfunction + +function! s:fim_on_exit(job_id, exit_code, event) dict + if a:exit_code != 0 + echom "Job failed with exit code: " . a:exit_code + endif + + let s:current_job = v:null +endfunction