docs: 新增知识图谱与高级检索设计文档(Phase 4 备用)
This commit is contained in:
@@ -0,0 +1,266 @@
|
|||||||
|
# 知识图谱与高级检索设计(Phase 4 备用)
|
||||||
|
|
||||||
|
> 本文记录 Phase 3 设计过程中裁剪的内容,待 Phase 4(Agent 运行时)制定时参考。
|
||||||
|
> 来源:`docs/6-memory-system.md` v1 版本,2026-06-07
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 背景
|
||||||
|
|
||||||
|
Phase 3 记忆系统方案做减法后,以下设计被推迟到 Phase 4。这些组件需要 Agent 的编排能力(LLM 提取标签、自动维护知识图谱、智能检索策略)才能真正产生价值,因此不适合在 Phase 3 的存储层实现。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. KnowledgeGraph(知识图谱)
|
||||||
|
|
||||||
|
### 1.1 设计意图
|
||||||
|
|
||||||
|
实体-关系图存储,用于关联检索。与 KnowledgeStore(内容/页面级)互补,提供实体级 + 关系维度的检索能力。
|
||||||
|
|
||||||
|
```
|
||||||
|
KnowledgeStore: 页面级内容("什么是 X")
|
||||||
|
KnowledgeGraph: 实体级关系("X 与什么相关")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.2 接口设计(原方案)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct GraphEntity {
|
||||||
|
pub id: String,
|
||||||
|
pub name: String,
|
||||||
|
pub entity_type: String, // "person" | "concept" | "project" | ...
|
||||||
|
pub description: String,
|
||||||
|
pub tags: Vec<String>, // 检索标签(全小写,原子词)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct GraphRelation {
|
||||||
|
pub source_id: String,
|
||||||
|
pub target_id: String,
|
||||||
|
pub relation_type: String, // "works_on" | "part_of" | "related_to" | ...
|
||||||
|
pub weight: f32, // 关系强度 [0.0, 1.0]
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum RelationDirection {
|
||||||
|
Outgoing, // source_id -> target_id(默认)
|
||||||
|
Incoming, // target_id -> source_id
|
||||||
|
Both, // 双向遍历
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct ScoredEntity {
|
||||||
|
pub entity: GraphEntity,
|
||||||
|
pub score: f32, // 基于图距离的评分 [0.0, 1.0]
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
pub trait KnowledgeGraph: Send + Sync {
|
||||||
|
// 实体管理
|
||||||
|
async fn add_entity(&self, entity: GraphEntity) -> Result<(), MemoryError>;
|
||||||
|
async fn get_entity(&self, id: &str) -> Result<Option<GraphEntity>, MemoryError>;
|
||||||
|
async fn remove_entity(&self, id: &str) -> Result<(), MemoryError>;
|
||||||
|
|
||||||
|
// 关系管理
|
||||||
|
async fn add_relation(&self, relation: GraphRelation) -> Result<(), MemoryError>;
|
||||||
|
async fn remove_relation(&self, source_id: &str, target_id: &str, relation_type: &str) -> Result<(), MemoryError>;
|
||||||
|
async fn get_related(
|
||||||
|
&self,
|
||||||
|
entity_id: &str,
|
||||||
|
depth: usize,
|
||||||
|
direction: RelationDirection,
|
||||||
|
relation_types: Option<&[&str]>,
|
||||||
|
) -> Result<Vec<ScoredEntity>, MemoryError>;
|
||||||
|
|
||||||
|
// 检索
|
||||||
|
async fn find_by_keywords(&self, keywords: &[String]) -> Result<Vec<GraphEntity>, MemoryError>;
|
||||||
|
|
||||||
|
// 标签管理
|
||||||
|
async fn find_tags(&self, prefix: &str) -> Result<Vec<String>, MemoryError>;
|
||||||
|
async fn entity_count_by_tag(&self, tag: &str) -> Result<usize, MemoryError>;
|
||||||
|
async fn set_entity_tags(&self, entity_id: &str, tags: Vec<String>) -> Result<usize, MemoryError>;
|
||||||
|
fn tag_constraints(&self) -> TagConstraints;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct TagConstraints {
|
||||||
|
pub max_tags_per_entity: usize, // 默认 8
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.3 标签复用原则
|
||||||
|
|
||||||
|
标签不应随意增长,应优先复用已有标签。流程:
|
||||||
|
|
||||||
|
```
|
||||||
|
LLM 提取候选标签 → 对每个候选:
|
||||||
|
graph.find_tags(&candidate.to_lowercase())
|
||||||
|
├─ 命中已有标签 → 复用
|
||||||
|
└─ 无匹配 → 注册新标签
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.4 标签容量与精炼
|
||||||
|
|
||||||
|
每个实体最多 `max_tags_per_entity`(默认 8)个标签,按关联度降序排列。超出上限时保留关联度最高的标签。
|
||||||
|
|
||||||
|
### 1.5 InMemoryGraph 实现
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct InMemoryGraph {
|
||||||
|
entities: Mutex<HashMap<String, GraphEntity>>,
|
||||||
|
relations: Mutex<Vec<GraphRelation>>,
|
||||||
|
tag_index: Mutex<HashMap<String, HashSet<String>>>, // tag → entity_ids
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
图遍历使用 BFS/DFS 算法,需用 `HashSet<String>` 记录已访问的实体 ID 以防环。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 高级评分策略
|
||||||
|
|
||||||
|
### 2.1 ScoringStrategy
|
||||||
|
|
||||||
|
Phase 3 仅使用内部的简单 TextOverlap 评分(Dice 系数)。Phase 4 可引入以下策略:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct ScoreWeights {
|
||||||
|
pub overlap: f32, // 默认 0.5 — 文本重叠度,以原始 query 为基准
|
||||||
|
pub graph: f32, // 默认 0.2 — 图距离
|
||||||
|
pub temporal: f32, // 默认 0.1 — 时间衰减
|
||||||
|
pub reference: f32, // 默认 0.2 — 引用计数
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum ScoringStrategy {
|
||||||
|
TextOverlap, // 以原始 query 为准绳的文本重叠度(默认)
|
||||||
|
GraphDistance,
|
||||||
|
TemporalWeight,
|
||||||
|
ReferenceCount,
|
||||||
|
Hybrid(ScoreWeights),
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct ScoreBreakdown {
|
||||||
|
pub overlap_score: f32,
|
||||||
|
pub graph_score: f32,
|
||||||
|
pub temporal_score: f32,
|
||||||
|
pub reference_score: f32,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.2 TextOverlap 算法
|
||||||
|
|
||||||
|
基于 Dice 系数计算 query 与召回内容的文本重叠度:
|
||||||
|
|
||||||
|
```
|
||||||
|
Dice = 2 × |intersect(bigrams)| / (|bigrams_query| + |bigrams_content|)
|
||||||
|
```
|
||||||
|
|
||||||
|
标题权重大于摘要,摘要权重大于正文。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 高级检索(MemoryRetriever 双通道版)
|
||||||
|
|
||||||
|
Phase 3 仅保留单通道(只搜 KnowledgeStore)。Phase 4 可恢复双通道:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct MemoryRetriever {
|
||||||
|
knowledge_store: KnowledgeStore,
|
||||||
|
knowledge_graph: Arc<dyn KnowledgeGraph>,
|
||||||
|
keyword_extractor: Arc<dyn KeywordExtractor>,
|
||||||
|
config: RetrieverConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum RetrievalStrategy {
|
||||||
|
Hybrid, // 结合所有通道 + 评分排序(默认)
|
||||||
|
KnowledgeOnly, // 仅 KnowledgeStore
|
||||||
|
GraphOnly, // 仅 KnowledgeGraph
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
检索流程:
|
||||||
|
|
||||||
|
```
|
||||||
|
1. 关键词提取(KeywordExtractor)
|
||||||
|
2. 并行召回:
|
||||||
|
- KnowledgeStore.find_by_keywords(keywords)
|
||||||
|
- KnowledgeGraph.find_by_keywords(keywords) → get_related() 图遍历
|
||||||
|
3. 逐条评分(ScoringStrategy)
|
||||||
|
4. 过滤掉 score < min_score 的结果
|
||||||
|
5. 排序 → 截取 top-N
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. KeywordExtractor
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub trait KeywordExtractor: Send + Sync {
|
||||||
|
fn extract(&self, query: &str) -> Vec<String>;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct SimpleKeywordExtractor {
|
||||||
|
stop_words: HashSet<String>,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
默认实现:按非字母数字字符分割,过滤停用词和单字符词。停用词表应包含英语常用停用词(约 80-100 个)。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. 基于召回价值的淘汰(RecallBased)
|
||||||
|
|
||||||
|
### 5.1 记忆价值评分
|
||||||
|
|
||||||
|
每条记忆维护召回统计,计算综合价值分数:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct RecallStats {
|
||||||
|
pub recall_count: u64, // 累计召回次数
|
||||||
|
pub total_score: f64, // 累计评分(平均分 = total_score / recall_count;recall_count 为 0 时需单独处理,避免 0/0 得到 NaN)
|
||||||
|
pub last_recall_at: i64, // 最后一次被召回的时间戳(秒)
|
||||||
|
}
|
||||||
|
|
||||||
|
// 记忆价值公式:
|
||||||
|
// value = ln(1 + recall_count) × w_recall + avg_score × w_score + recency × w_recency
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.2 record_recall()
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// MemoryStore trait 可选方法
|
||||||
|
async fn record_recall(&self, id: &str, score: f32) -> Result<(), MemoryError> {
|
||||||
|
Ok(()) // 默认空实现(不记录任何统计);需要召回统计时应覆盖此方法
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.3 淘汰策略
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub enum EvictionPolicy {
|
||||||
|
// ...Phase 3 已有: None, Ttl, Capacity...
|
||||||
|
|
||||||
|
RecallBased {
|
||||||
|
max_items: usize,
|
||||||
|
recall_weight: f32, // 默认 0.3
|
||||||
|
score_weight: f32, // 默认 0.5
|
||||||
|
recency_weight: f32, // 默认 0.2
|
||||||
|
},
|
||||||
|
Hybrid {
|
||||||
|
ttl_secs: Option<u64>,
|
||||||
|
max_items: Option<usize>,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Phase 3 → Phase 4 迁移建议
|
||||||
|
|
||||||
|
| 组件 | Phase 3 状态 | Phase 4 迁移方式 |
|
||||||
|
|------|-------------|-----------------|
|
||||||
|
| KnowledgeStore | 具体 struct(基于 MemoryStore) | 保持 struct,新增知识图谱数据入口 |
|
||||||
|
| KnowledgeGraph | 不存在 | 新建 `memory/graph.rs`,实现 trait + InMemoryGraph |
|
||||||
|
| MemoryRetriever | 单通道(仅 KnowledgeStore) | 增加 KnowledgeGraph 通道,恢复双通道检索 |
|
||||||
|
| ScoringStrategy | 内部 TextOverlap | 恢复枚举策略 + MemoryRetriever 配置 |
|
||||||
|
| KeywordExtractor | MemoryRetriever 内部拆分逻辑 | 抽取为独立 trait + SimpleKeywordExtractor 默认实现 |
|
||||||
|
| RecallBased 淘汰 | 不存在 | 恢复 EvictionPolicy 变体 + RecallStats |
|
||||||
|
| 标签管理 | 不存在 | 恢复 tag_index + find_tags + set_entity_tags |
|
||||||
|
|
||||||
|
**关键依赖**:Phase 4 的 Agent 编排是 KnowledgeGraph 和标签管理的驱动者。如果没有 Agent 的 LLM 调用来提取标签、维护知识,KnowledgeGraph 只是一个空的图存储。
|
||||||
Reference in New Issue
Block a user