Commit 478522d (parent e22a1b7)

docs(metrics): update README with comprehensive quality assessment documentation

File tree: 6 files changed, +73 / -11 lines

README.md

Lines changed: 26 additions & 0 deletions
@@ -14,6 +14,7 @@ LieGraph is a multi-agent implementation of the popular social deduction game "W
 - **Natural Language Interaction:** Agents communicate and reason in natural language throughout the game
 - **Probabilistic Belief System:** Sophisticated belief tracking with self-belief confidence and suspicions matrix
 - **Strategic Reasoning:** Advanced bluff detection, alliance formation, and long-term planning
+- **Built-in Metrics:** Automatic quality tracking for win balance, identification accuracy, and speech diversity with JSON reports for prompt evaluation workflows

 ## 🚀 Quick Start

@@ -158,6 +159,31 @@ game:
   # ...
 ```

+## 📊 Metrics & Evaluation
+
+LieGraph ships with a lightweight metrics collector (`src/game/metrics.py`) that records quality indicators as games unfold:
+
+- **Win balance:** Civilian vs. spy win rates and a fairness score targeting 50/50 outcomes.
+- **Identification accuracy:** Tracks how confidently players identify their own roles and others over time.
+- **Speech diversity:** Measures lexical variety per speech turn to surface repetitive phrasing.
+
+Metrics are streamed to memory during play and automatically persisted when a game ends:
+
+- Per-game summaries: `logs/metrics/{game_id}.json`
+- Rolling aggregate + functional quality score: `logs/metrics/overall.json`
+
+You can also access the live collector from code:
+
+```python
+from src.game.metrics import metrics_collector
+
+audit = metrics_collector.get_overall_metrics()
+score = metrics_collector.compute_quality_score()  # deterministic
+# metrics_collector.compute_quality_score(method="llm", llm=client) for LLM-based review
+```
+
+These outputs are ready to feed into downstream prompt-evaluation or offline analysis pipelines.
+
 ## 🛠️ Development

 ### Project Structure
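The speech-diversity metric described in the README addition lends itself to a type-token-ratio style measure. A minimal sketch, assuming a naive whitespace tokenizer; the actual formula in `src/game/metrics.py` is not shown in this diff and may differ:

```python
def lexical_diversity(speech: str) -> float:
    """Type-token ratio: unique tokens / total tokens (hypothetical sketch)."""
    tokens = speech.lower().split()  # naive whitespace tokenization
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

# Repetitive phrasing scores low; varied phrasing scores high.
print(lexical_diversity("round round round round"))  # 0.25
print(lexical_diversity("it rolls and bounces"))     # 1.0
```

A per-turn score like this makes repetitive agents easy to spot in the aggregate report without any LLM call.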

README_zh.md

Lines changed: 27 additions & 1 deletion
@@ -15,6 +15,7 @@ LieGraph is an implementation of the popular social deduction game "Who Is the Spy" built on LangGraph
 - **Natural Language Interaction:** Agents communicate and reason in natural language throughout the game
 - **Probabilistic Belief System:** Sophisticated belief tracking with self-belief confidence and a suspicion matrix
 - **Strategic Reasoning:** Advanced bluff detection, alliance formation, and long-term planning
+- **Built-in Metrics:** Automatically tracks win balance, identification accuracy, and speech diversity, producing JSON reports for later prompt evaluation

 ## 🚀 Quick Start

@@ -32,6 +33,31 @@ LieGraph is an implementation of the popular social deduction game "Who Is the Spy" built on LangGraph
 touch .env
 ```

+## 📊 Metrics & Evaluation
+
+The project ships a lightweight metrics collector (`src/game/metrics.py`) that records the following indicators live during play:
+
+- **Win balance:** Tracks civilian vs. spy win rates and a fairness score targeting 50/50 outcomes.
+- **Identification accuracy:** Monitors how accurately players judge their own and others' identities over time.
+- **Speech diversity:** Computes per-turn lexical variety to surface repetitive or monotonous speech.
+
+When a game ends, metrics are automatically written to:
+
+- Per-game summary: `logs/metrics/{game_id}.json`
+- Overall aggregate plus functional score: `logs/metrics/overall.json`
+
+To access live data from code, call:
+
+```python
+from src.game.metrics import metrics_collector
+
+report = metrics_collector.get_overall_metrics()
+score = metrics_collector.compute_quality_score()  # functional scoring
+# metrics_collector.compute_quality_score(method="llm", llm=client) for an LLM-based review
+```
+
+These outputs can feed directly into downstream prompt-evaluation or offline analysis pipelines.
+
 **OpenAI configuration example:**
 ```
 LLM_PROVIDER=openai

@@ -203,4 +229,4 @@ python -m pytest tests/ -v

 ## 📄 License

-This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
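The win-balance fairness score "targeting 50/50 outcomes" can be sketched as a distance from an even split. This is a hypothetical formula for illustration; the actual weighting in `src/game/metrics.py` is not visible in this diff:

```python
def fairness_score(civilian_wins: int, spy_wins: int) -> float:
    """1.0 at a perfect 50/50 split, 0.0 when one side sweeps (sketch)."""
    total = civilian_wins + spy_wins
    if total == 0:
        return 0.0  # no finished games yet
    civilian_rate = civilian_wins / total
    return 1.0 - abs(civilian_rate - 0.5) * 2.0

print(fairness_score(5, 5))   # 1.0  (balanced)
print(fairness_score(3, 1))   # 0.5  (civilian-skewed)
print(fairness_score(10, 0))  # 0.0  (one-sided)
```

A score like this is deterministic, which matches the "functional" scoring path mentioned alongside the LLM-based review.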

src/game/config.py

Lines changed: 4 additions & 3 deletions
@@ -22,10 +22,10 @@
 - Player name pool management
 """

-import random
+import os
 from typing import List, Tuple
+
 import yaml
-import os


 class GameConfig:
@@ -50,7 +50,8 @@ def _load_config(self) -> dict:
         # Return default configuration
         return self._get_default_config()

-    def _get_default_config(self) -> dict:
+    @staticmethod
+    def _get_default_config() -> dict:
         """Get default configuration."""
         return {
             "game": {
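The `_get_default_config` change above follows a common refactor: a method that touches no instance state becomes a `@staticmethod`, callable with or without an instance. A minimal illustration using a toy class (not the real `GameConfig`; the config values are placeholders):

```python
class ConfigDemo:
    """Toy stand-in for GameConfig, illustrating only the refactor."""

    def load(self) -> dict:
        # Instance callers keep working unchanged after the refactor.
        return self._default_config()

    @staticmethod
    def _default_config() -> dict:
        # No `self`: the defaults depend on no instance state, so tests
        # can call ConfigDemo._default_config() without constructing one.
        return {"game": {"num_players": 6}}  # placeholder values

print(ConfigDemo._default_config())  # callable without an instance
print(ConfigDemo().load())           # and via self, as before
```

The same reasoning applies to the `metrics.py` hunks later in this commit, which convert several pure helper methods the same way.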

src/game/llm_strategy.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@

 from trustcall import create_extractor

-from src.game.state import GameState, Speech, PlayerMindset, Suspicion, Vote, SelfBelief
+from src.game.state import GameState, Speech, PlayerMindset, Suspicion, SelfBelief
 from src.tools.llm import create_llm

 # Game rules are now managed by the configuration system

src/game/metrics.py

Lines changed: 14 additions & 5 deletions
@@ -333,7 +333,8 @@ def _summarize_game(self, game: Dict[str, Any]) -> Dict[str, Any]:
             "speech_diversity": speech_summary,
         }

-    def _trend(self, round_metrics: Dict[int, Dict[str, Optional[float]]], *, key: str):
+    @staticmethod
+    def _trend(round_metrics: Dict[int, Dict[str, Optional[float]]], *, key: str):
         if not round_metrics:
             return None

@@ -347,7 +348,8 @@ def _trend(self, round_metrics: Dict[int, Dict[str, Optional[float]]], *, key: s
             return None
         return last - first

-    def _summarize_speeches(self, speeches: List[SpeechRecord]) -> Dict[str, Any]:
+    @staticmethod
+    def _summarize_speeches(speeches: List[SpeechRecord]) -> Dict[str, Any]:
         if not speeches:
             return {
                 "average_diversity": 0.0,
@@ -471,7 +473,8 @@ def _aggregate_speech_metrics(self) -> Dict[str, Any]:
             "by_player": per_player,
         }

-    def _compute_functional_score(self, summary: Dict[str, Any]) -> Dict[str, float]:
+    @staticmethod
+    def _compute_functional_score(summary: Dict[str, Any]) -> Dict[str, float]:
         win_balance = summary.get("win_balance_score", 0.0)
         identification = summary.get("identification", {})
         speech = summary.get("speech_diversity", {})
@@ -501,7 +504,8 @@ def _compute_functional_score(self, summary: Dict[str, Any]) -> Dict[str, float]
             "speech_diversity": round(speech_component, 4),
         }

-    def _format_summary_for_llm(self, summary: Dict[str, Any]) -> Dict[str, Any]:
+    @staticmethod
+    def _format_summary_for_llm(summary: Dict[str, Any]) -> Dict[str, Any]:
         """Format metrics into an instruction for an LLM reviewer."""
         instructions = (
             "You are evaluating the quality of repeated 'Who Is Spy' games. "
@@ -524,7 +528,12 @@ def _persist_game_summary(self, summary: Dict[str, Any]) -> None:

     def _persist_overall_metrics(self) -> None:
         path = self._output_dir / "overall.json"
-        payload = self.get_overall_metrics()
+        summary = self.get_overall_metrics()
+        score = self._compute_functional_score(summary)
+        payload = {
+            "metrics": summary,
+            "quality_score": score,
+        }
         with path.open("w", encoding="utf-8") as fp:
             json.dump(payload, fp, ensure_ascii=False, indent=2)
src/tools/graph_viz.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from __future__ import annotations

 from pathlib import Path
-from typing import Any, Protocol
+from typing import Protocol

 try:  # Imported lazily so pure-Python usage works without IPython.
     from IPython import get_ipython
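The `try:` block shown in context is the optional-dependency import pattern its comment describes: IPython is imported lazily so the module still loads in a pure-Python environment. A generic sketch of that pattern; the fallback branch here is an assumption for illustration, since this diff does not include the file's actual `except` clause:

```python
try:  # Optional dependency: degrade gracefully when IPython is absent.
    from IPython import get_ipython
except ImportError:
    def get_ipython():
        return None  # stand-in: behaves like "no interactive shell"

# Either way, callers can probe the runtime environment uniformly.
shell = get_ipython()
running_interactively = shell is not None
```

This keeps rich-display features available in notebooks without making IPython a hard requirement of the package.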
