From b571f530f8d16e3489270fb681da311b44794324 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BE=90=E6=B6=9B?= Date: Sun, 7 Jun 2026 22:42:49 +0800 Subject: [PATCH] =?UTF-8?q?docs:=20=E6=96=B0=E5=A2=9E=E7=9F=A5=E8=AF=86?= =?UTF-8?q?=E5=9B=BE=E8=B0=B1=E4=B8=8E=E9=AB=98=E7=BA=A7=E6=A3=80=E7=B4=A2?= =?UTF-8?q?=E8=AE=BE=E8=AE=A1=E6=96=87=E6=A1=A3=EF=BC=88Phase=204=20?= =?UTF-8?q?=E5=A4=87=E7=94=A8=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/note-knowledge-graph-design.md | 266 ++++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 docs/note-knowledge-graph-design.md diff --git a/docs/note-knowledge-graph-design.md b/docs/note-knowledge-graph-design.md new file mode 100644 index 0000000..e72a433 --- /dev/null +++ b/docs/note-knowledge-graph-design.md @@ -0,0 +1,266 @@ +# 知识图谱与高级检索设计(Phase 4 备用) + +> 本文记录 Phase 3 设计过程中裁剪的内容,待 Phase 4(Agent 运行时)制定时参考。 +> 来源:`docs/6-memory-system.md` v1 版本,2026-06-07 + +--- + +## 背景 + +Phase 3 记忆系统方案做减法后,以下设计被推迟到 Phase 4。这些组件需要 Agent 的编排能力(LLM 提取标签、自动维护知识图谱、智能检索策略)才能真正产生价值,因此不适合在 Phase 3 的存储层实现。 + +--- + +## 1. KnowledgeGraph(知识图谱) + +### 1.1 设计意图 + +实体-关系图存储,用于关联检索。与 KnowledgeStore(内容/页面级)互补,提供实体级 + 关系维度的检索能力。 + +``` +KnowledgeStore: 页面级内容("什么是 X") +KnowledgeGraph: 实体级关系("X 与什么相关") +``` + +### 1.2 接口设计(原方案) + +```rust +pub struct GraphEntity { + pub id: String, + pub name: String, + pub entity_type: String, // "person" | "concept" | "project" | ... + pub description: String, + pub tags: Vec<String>, // 检索标签(全小写,原子词) +} + +pub struct GraphRelation { + pub source_id: String, + pub target_id: String, + pub relation_type: String, // "works_on" | "part_of" | "related_to" | ... 
+ pub weight: f32, // 关系强度 [0.0, 1.0] +} + +pub enum RelationDirection { + Outgoing, // source_id -> target_id(默认) + Incoming, // target_id -> source_id + Both, // 双向遍历 +} + +pub struct ScoredEntity { + pub entity: GraphEntity, + pub score: f32, // 基于图距离的评分 [0.0, 1.0] +} + +#[async_trait] +pub trait KnowledgeGraph: Send + Sync { + // 实体管理 + async fn add_entity(&self, entity: GraphEntity) -> Result<(), MemoryError>; + async fn get_entity(&self, id: &str) -> Result<Option<GraphEntity>, MemoryError>; + async fn remove_entity(&self, id: &str) -> Result<(), MemoryError>; + + // 关系管理 + async fn add_relation(&self, relation: GraphRelation) -> Result<(), MemoryError>; + async fn remove_relation(&self, source_id: &str, target_id: &str, relation_type: &str) -> Result<(), MemoryError>; + async fn get_related( + &self, + entity_id: &str, + depth: usize, + direction: RelationDirection, + relation_types: Option<&[&str]>, + ) -> Result<Vec<ScoredEntity>, MemoryError>; + + // 检索 + async fn find_by_keywords(&self, keywords: &[String]) -> Result<Vec<GraphEntity>, MemoryError>; + + // 标签管理 + async fn find_tags(&self, prefix: &str) -> Result<Vec<String>, MemoryError>; + async fn entity_count_by_tag(&self, tag: &str) -> Result<usize, MemoryError>; + async fn set_entity_tags(&self, entity_id: &str, tags: Vec<String>) -> Result<usize, MemoryError>; // 返回类型推测为实际保留的标签数,待确认 + fn tag_constraints(&self) -> TagConstraints; +} + +pub struct TagConstraints { + pub max_tags_per_entity: usize, // 默认 8 +} +``` + +### 1.3 标签复用原则 + +标签不应随意增长,应优先复用已有标签。流程: + +``` +LLM 提取候选标签 → 对每个候选: + graph.find_tags(candidate.to_lowercase()) + ├─ 命中已有标签 → 复用 + └─ 无匹配 → 注册新标签 +``` + +### 1.4 标签容量与精炼 + +每个实体最多 `max_tags_per_entity`(默认 8)个标签,按关联度降序排列。超出上限时保留关联度最高的标签。 + +### 1.5 InMemoryGraph 实现 + +```rust +pub struct InMemoryGraph { + entities: Mutex<HashMap<String, GraphEntity>>, + relations: Mutex<Vec<GraphRelation>>, + tag_index: Mutex<HashMap<String, HashSet<String>>>, // tag → entity_ids +} +``` + +图遍历使用 BFS/DFS 算法,需用 `HashSet` 防环。 + +--- + +## 2. 
高级评分策略 + +### 2.1 ScoringStrategy + +Phase 3 仅使用内部的简单 TextOverlap 评分(Dice 系数)。Phase 4 可引入以下策略: + +```rust +pub struct ScoreWeights { + pub overlap: f32, // 默认 0.5 — 文本重叠度,以原始 query 为基准 + pub graph: f32, // 默认 0.2 — 图距离 + pub temporal: f32, // 默认 0.1 — 时间衰减 + pub reference: f32, // 默认 0.2 — 引用计数 +} + +pub enum ScoringStrategy { + TextOverlap, // 以原始 query 为准绳的文本重叠度(默认) + GraphDistance, + TemporalWeight, + ReferenceCount, + Hybrid(ScoreWeights), +} + +pub struct ScoreBreakdown { + pub overlap_score: f32, + pub graph_score: f32, + pub temporal_score: f32, + pub reference_score: f32, +} +``` + +### 2.2 TextOverlap 算法 + +基于 Dice 系数计算 query 与召回内容的文本重叠度: + +``` +Dice = 2 × |bigrams_query ∩ bigrams_content| / (|bigrams_query| + |bigrams_content|) +``` + +标题权重大于摘要,摘要权重大于正文。 + +--- + +## 3. 高级检索(MemoryRetriever 双通道版) + +Phase 3 仅保留单通道(只搜 KnowledgeStore)。Phase 4 可恢复双通道: + +```rust +pub struct MemoryRetriever { + knowledge_store: KnowledgeStore, + knowledge_graph: Arc<dyn KnowledgeGraph>, + keyword_extractor: Arc<dyn KeywordExtractor>, + config: RetrieverConfig, +} + +pub enum RetrievalStrategy { + Hybrid, // 结合所有通道 + 评分排序(默认) + KnowledgeOnly, // 仅 KnowledgeStore + GraphOnly, // 仅 KnowledgeGraph +} +``` + +检索流程: + +``` +1. 关键词提取(KeywordExtractor) +2. 并行召回: + - KnowledgeStore.find_by_keywords(keywords) + - KnowledgeGraph.find_by_keywords(keywords) → get_related() 图遍历 +3. 逐条评分(ScoringStrategy) +4. 过滤 score < min_score +5. 排序 → 截取 top-N +``` + +--- + +## 4. KeywordExtractor + +```rust +pub trait KeywordExtractor: Send + Sync { + fn extract(&self, query: &str) -> Vec<String>; +} + +pub struct SimpleKeywordExtractor { + stop_words: HashSet<String>, +} +``` + +默认实现:按非字母数字字符分割,过滤停用词和单字符词。停用词表应包含英语常用停用词(约 80-100 个)。 + +--- + +## 5. 
基于召回价值的淘汰(RecallBased) + +### 5.1 记忆价值评分 + +每条记忆维护召回统计,计算综合价值分数: + +```rust +pub struct RecallStats { + pub recall_count: u64, // 累计召回次数 + pub total_score: f64, // 累计评分(平均分 = total_score / recall_count;recall_count 为 0 时应视为 0) + pub last_recall_at: i64, // 最后一次被召回的时间戳(秒) +} + +// 记忆价值公式: +// value = ln(1 + recall_count) × w_recall + avg_score × w_score + recency × w_recency +``` + +### 5.2 record_recall() + +```rust +// MemoryStore trait 可选方法 +async fn record_recall(&self, id: &str, score: f32) -> Result<(), MemoryError> { + Ok(()) // 默认空实现;需要召回统计(RecallBased 淘汰)的存储应覆盖此方法 +} +``` + +### 5.3 淘汰策略 + +```rust +pub enum EvictionPolicy { + // ...Phase 3 已有: None, Ttl, Capacity... + + RecallBased { + max_items: usize, + recall_weight: f32, // 默认 0.3 + score_weight: f32, // 默认 0.5 + recency_weight: f32, // 默认 0.2 + }, + Hybrid { + ttl_secs: Option<u64>, + max_items: Option<usize>, + }, +} +``` + +--- + +## 6. Phase 3 → Phase 4 迁移建议 + +| 组件 | Phase 3 状态 | Phase 4 迁移方式 | +|------|-------------|-----------------| +| KnowledgeStore | 具体 struct(基于 MemoryStore) | 保持 struct,新增知识图谱数据入口 | +| KnowledgeGraph | 不存在 | 新建 `memory/graph.rs`,实现 trait + InMemoryGraph | +| MemoryRetriever | 单通道(仅 KnowledgeStore) | 增加 KnowledgeGraph 通道,恢复双通道检索 | +| ScoringStrategy | 内部 TextOverlap | 恢复枚举策略 + MemoryRetriever 配置 | +| KeywordExtractor | MemoryRetriever 内部拆分逻辑 | 抽取为独立 struct | +| RecallBased 淘汰 | 不存在 | 恢复 EvictionPolicy 变体 + RecallStats | +| 标签管理 | 不存在 | 恢复 tag_index + find_tags + set_entity_tags | + +**关键依赖**:Phase 4 的 Agent 编排是 KnowledgeGraph 和标签管理的驱动者。如果没有 Agent 的 LLM 调用来提取标签、维护知识,KnowledgeGraph 只是一个空的图存储。