Skip to content

Commit

Permalink
Fix too-long-context issue. (#4735)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

#4728

### Type of change

- [x] Bug Fix (non-breaking change that fixes an issue)
  • Loading branch information
KevinHuSh authored Feb 6, 2025
1 parent a3a7043 commit 2a07eb6
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 3 deletions.
2 changes: 2 additions & 0 deletions graphrag/general/community_reports_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def __call__(self, graph: nx.Graph, callback: Callable | None = None):
weight = ents["weight"]
ents = ents["nodes"]
ent_df = pd.DataFrame(self._get_entity_(ents)).dropna()#[{"entity": n, **graph.nodes[n]} for n in ents])
if ent_df.empty:
continue
ent_df["entity"] = ent_df["entity_name"]
del ent_df["entity_name"]
rela_df = pd.DataFrame(self._get_relation_(list(ent_df["entity"]), list(ent_df["entity"]), 10000))
Expand Down
3 changes: 2 additions & 1 deletion graphrag/general/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ def __call__(
with ThreadPoolExecutor(max_workers=max_workers) as exe:
threads = []
for i, (cid, ck) in enumerate(chunks):
ck = truncate(ck, int(self._llm.max_length*0.8))
threads.append(
exe.submit(self._process_single_content, (cid, ck)))

Expand Down Expand Up @@ -241,5 +242,5 @@ def _handle_entity_relation_summary(
)
use_prompt = prompt_template.format(**context_base)
logging.info(f"Trigger summary: {entity_or_relation_name}")
summary = self._chat(use_prompt, [{"role": "assistant", "content": "Output: "}], {"temperature": 0.8})
summary = self._chat(use_prompt, [{"role": "user", "content": "Output: "}], {"temperature": 0.8})
return summary
2 changes: 1 addition & 1 deletion graphrag/general/graph_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def _process_single_content(self,
token_count += num_tokens_from_string(hint_prompt + response)

results = response or ""
history = [{"role": "system", "content": hint_prompt}, {"role": "assistant", "content": response}]
history = [{"role": "system", "content": hint_prompt}, {"role": "user", "content": response}]

# Repeat to ensure we maximize entity count
for i in range(self._max_gleanings):
Expand Down
2 changes: 1 addition & 1 deletion graphrag/light/graph_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def _process_single_content(self, chunk_key_dp: tuple[str, str]):
).format(**self._context_base, input_text=content)

try:
gen_conf = {"temperature": 0.3}
gen_conf = {"temperature": 0.8}
final_result = self._chat(hint_prompt, [{"role": "user", "content": "Output:"}], gen_conf)
token_count += num_tokens_from_string(hint_prompt + final_result)
history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
Expand Down

0 comments on commit 2a07eb6

Please sign in to comment.