using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Data.Sqlite;
namespace AxCopilot.Services;
public partial class CodeIndexService
{
// ── 검색 + TF-IDF + Dispose ──────────────────────────────────────────
// ── 검색 ────────────────────────────────────────────────────────────
/// <summary>
/// Semantic search: returns the indexed code chunks most relevant to <paramref name="query"/>,
/// ranked by TF-IDF cosine similarity.
/// </summary>
/// <param name="query">Free-text question or keywords. Null, blank, or input that tokenizes to nothing yields an empty list.</param>
/// <param name="maxResults">Maximum number of hits to return; a non-positive value yields an empty list.</param>
/// <returns>
/// Up to <paramref name="maxResults"/> hits ordered by descending score. Empty when no index is
/// loaded, when no chunk shares a token with the query, or when no chunk scores above 0.01.
/// </returns>
public List<SearchResult> Search(string query, int maxResults = 5)
{
    var db = _db;
    if (!_indexed || db == null || _totalDocs == 0)
        return new();

    // A blank query has no tokens and a non-positive limit can never produce hits,
    // so skip the database work in both cases.
    if (string.IsNullOrWhiteSpace(query) || maxResults <= 0)
        return new();

    var queryTokens = Tokenize(query);
    if (queryTokens.Count == 0) return new();

    // Document frequency of each query token; tokens missing from doc_freq are left out of the map.
    var dfMap = new Dictionary<string, int>();
    using (var dfCmd = db.CreateCommand())
    {
        // The command is built once and only its parameter value changes per token.
        dfCmd.CommandText = "SELECT df FROM doc_freq WHERE token = @t";
        var dfToken = dfCmd.CreateParameter();
        dfToken.ParameterName = "@t";
        dfCmd.Parameters.Add(dfToken);

        foreach (var token in queryTokens.Keys)
        {
            dfToken.Value = token;
            var result = dfCmd.ExecuteScalar();
            if (result is not null and not DBNull)
                dfMap[token] = Convert.ToInt32(result);
        }
    }

    // Candidate chunks: only chunks containing at least one query token.
    var candidateChunks = new HashSet<int>();
    using (var candidateCmd = db.CreateCommand())
    {
        candidateCmd.CommandText = "SELECT DISTINCT chunk_id FROM tokens WHERE token = @t";
        var candidateToken = candidateCmd.CreateParameter();
        candidateToken.ParameterName = "@t";
        candidateCmd.Parameters.Add(candidateToken);

        foreach (var token in queryTokens.Keys)
        {
            candidateToken.Value = token;
            using var reader = candidateCmd.ExecuteReader();
            while (reader.Read()) candidateChunks.Add(reader.GetInt32(0));
        }
    }
    if (candidateChunks.Count == 0) return new();

    // Score every candidate chunk against the query.
    var scored = new List<(int ChunkId, double Score)>();
    using (var tfCmd = db.CreateCommand())
    {
        tfCmd.CommandText = "SELECT token, tf FROM tokens WHERE chunk_id = @cid";
        var tfChunkId = tfCmd.CreateParameter();
        tfChunkId.ParameterName = "@cid";
        tfCmd.Parameters.Add(tfChunkId);

        foreach (var chunkId in candidateChunks)
        {
            // Load the chunk's term frequencies.
            var docTf = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
            tfChunkId.Value = chunkId;
            using (var reader = tfCmd.ExecuteReader())
            {
                while (reader.Read())
                    docTf[reader.GetString(0)] = reader.GetInt32(1);
            }

            var score = ComputeTfIdfSimilarity(queryTokens, docTf, dfMap);
            if (score > 0.01)
                scored.Add((chunkId, score));
        }
    }

    // Keep the best-scoring chunks.
    var topChunks = scored
        .OrderByDescending(s => s.Score)
        .Take(maxResults)
        .ToList();

    // Resolve each winning chunk to its file path, line range and a short preview.
    var results = new List<SearchResult>();
    using (var chunkCmd = db.CreateCommand())
    {
        chunkCmd.CommandText = """
            SELECT f.path, c.start_line, c.end_line, c.content
            FROM chunks c JOIN files f ON c.file_id = f.id
            WHERE c.id = @cid
            """;
        var chunkIdParam = chunkCmd.CreateParameter();
        chunkIdParam.ParameterName = "@cid";
        chunkCmd.Parameters.Add(chunkIdParam);

        foreach (var (chunkId, score) in topChunks)
        {
            chunkIdParam.Value = chunkId;
            using var reader = chunkCmd.ExecuteReader();
            if (!reader.Read()) continue;

            // Read the content column once; previews are capped at 200 characters.
            var content = reader.GetString(3);
            results.Add(new SearchResult
            {
                FilePath = reader.GetString(0),
                StartLine = reader.GetInt32(1),
                EndLine = reader.GetInt32(2),
                Score = score,
                Preview = content.Length > 200 ? content[..200] + "..." : content,
            });
        }
    }
    return results;
}
/// <summary>
/// Loads an already-built index, if one exists (e.g. after an application restart).
/// </summary>
/// <param name="workFolder">
/// Folder whose index should be loaded. Nothing happens when it is null, empty,
/// not an existing directory, or has no index database file.
/// </param>
public void TryLoadExisting(string workFolder)
{
    // Nothing to load without a real working folder.
    if (string.IsNullOrEmpty(workFolder))
    {
        return;
    }

    if (!Directory.Exists(workFolder))
    {
        return;
    }

    // Only proceed when the index database file is already on disk.
    if (!File.Exists(GetDbPath(workFolder)))
    {
        return;
    }

    EnsureDb(workFolder);

    // The index counts as loaded only if it contains at least one chunk.
    _totalDocs = GetTotalChunkCount();
    _indexed = _totalDocs > 0;
    if (!_indexed)
    {
        return;
    }

    LogService.Info($"기존 코드 인덱스 로드: {_totalDocs}개 청크 [{workFolder}]");
}
// ── TF-IDF 계산 ─────────────────────────────────────────────────────
/// <summary>
/// Computes the cosine similarity between the TF-IDF weighted query vector and the
/// TF-IDF weighted vector of a single chunk.
/// </summary>
/// <param name="queryTf">Term frequencies of the query tokens.</param>
/// <param name="docTf">Term frequencies of the chunk's tokens.</param>
/// <param name="dfMap">Document frequency per token; tokens that are missing count as df = 0.</param>
/// <returns>The cosine similarity, or 0 when either vector has zero length.</returns>
private double ComputeTfIdfSimilarity(
    Dictionary<string, int> queryTf,
    Dictionary<string, int> docTf,
    Dictionary<string, int> dfMap)
{
    // Smoothed inverse document frequency: log(1 + N / (1 + df)).
    double Idf(string token) =>
        Math.Log(1.0 + _totalDocs / (1.0 + dfMap.GetValueOrDefault(token, 0)));

    double dotProduct = 0, queryNorm = 0, docNorm = 0;

    // Query side: accumulate the query norm and the dot product over shared tokens.
    foreach (var (token, qtf) in queryTf)
    {
        var idf = Idf(token);
        var qWeight = qtf * idf;
        queryNorm += qWeight * qWeight;
        if (docTf.TryGetValue(token, out var dtf))
        {
            var dWeight = dtf * idf;
            dotProduct += qWeight * dWeight;
        }
    }

    // Chunk side: the norm covers every token of the chunk, not only the shared ones.
    // NOTE(review): a token absent from dfMap is weighted as df = 0 (the maximum IDF).
    // If the caller only fills dfMap for query tokens, chunk-only tokens inflate this
    // norm — confirm that this is intended.
    foreach (var (token, dtf) in docTf)
    {
        var dWeight = dtf * Idf(token);
        docNorm += dWeight * dWeight;
    }

    if (queryNorm == 0 || docNorm == 0) return 0;
    return dotProduct / (Math.Sqrt(queryNorm) * Math.Sqrt(docNorm));
}
// ── 토큰화 ──────────────────────────────────────────────────────────
/// <summary>
/// Splits text into lower-cased tokens and counts how often each occurs. Stop words and
/// tokens shorter than two characters are dropped, and every pair of adjacent surviving
/// words is additionally counted as a "first_second" bigram.
/// </summary>
/// <param name="text">Text to tokenize. Null or empty input yields an empty map.</param>
/// <returns>A case-insensitive map from token (or bigram) to its occurrence count.</returns>
private static Dictionary<string, int> Tokenize(string text)
{
    var tf = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
    if (string.IsNullOrEmpty(text)) return tf;

    // Materialize once: the word list feeds both the unigram and the bigram pass, so a
    // deferred query would repeat the regex split and the filtering for each pass.
    // The stop-word filter runs on the original casing, before lower-casing.
    var wordList = Regex.Split(text, @"[^a-zA-Z0-9가-힣_]+")
        .SelectMany(SplitCamelCase)
        .Where(w => w.Length >= 2 && !StopWords.Contains(w))
        .Select(w => w.ToLowerInvariant())
        .ToList();

    foreach (var word in wordList)
    {
        tf.TryGetValue(word, out var count);
        tf[word] = count + 1;
    }

    // Bigrams improve phrase-search quality.
    for (int i = 0; i < wordList.Count - 1; i++)
    {
        var bigram = $"{wordList[i]}_{wordList[i + 1]}";
        tf.TryGetValue(bigram, out var bc);
        tf[bigram] = bc + 1;
    }
    return tf;
}
/// <summary>
/// Splits an identifier into segments, starting a new segment at every uppercase
/// character after the first ("getUserName" yields "get", "User", "Name").
/// Consecutive uppercase characters each start their own segment, so "XMLParser"
/// yields "X", "M", "L", "Parser". Digits and underscores stay attached to the
/// segment they follow.
/// </summary>
/// <param name="word">The identifier to split. Null or empty input yields no segments.</param>
/// <returns>The segments in order of appearance, with their original casing.</returns>
private static IEnumerable<string> SplitCamelCase(string word)
{
    if (string.IsNullOrEmpty(word)) yield break;

    var start = 0;
    for (var i = 1; i < word.Length; i++)
    {
        // The first character never closes a segment, hence the loop starts at 1.
        if (!char.IsUpper(word[i])) continue;

        yield return word[start..i];
        start = i;
    }

    // The trailing segment is never empty because word has at least one character.
    yield return word[start..];
}
// ── Dispose ─────────────────────────────────────────────────────────
/// <summary>
/// Disposes the object held in <c>_db</c>, if any, and clears the field.
/// Calling this again afterwards does nothing.
/// </summary>
public void Dispose()
{
    if (_db is not null)
    {
        _db.Dispose();
    }

    _db = null;
}
}