using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Data.Sqlite;

namespace AxCopilot.Services;

public partial class CodeIndexService
{
    // ── Search + TF-IDF + Dispose ────────────────────────────────────────

    // ── Search ───────────────────────────────────────────────────────────

    /// <summary>
    /// Semantic search: returns the indexed code chunks most relevant to <paramref name="query"/>,
    /// ranked by TF-IDF cosine similarity.
    /// </summary>
    /// <param name="query">Free-text question or keywords. Null or whitespace yields an empty result.</param>
    /// <param name="maxResults">Maximum number of results to return. Values &lt;= 0 yield an empty result.</param>
    /// <returns>
    /// Up to <paramref name="maxResults"/> results in descending score order; empty when no index is
    /// loaded, the query produces no tokens, or no chunk scores above the 0.01 cutoff.
    /// </returns>
    public List<SearchResult> Search(string query, int maxResults = 5)
    {
        var db = _db;
        if (!_indexed || db == null || _totalDocs == 0)
            return new();
        if (string.IsNullOrWhiteSpace(query) || maxResults <= 0)
            return new();

        var queryTokens = Tokenize(query);
        if (queryTokens.Count == 0)
            return new();

        // Document frequency per token. Seeded with the query tokens here and extended with every
        // candidate chunk's tokens below, so the document norm is computed from real DF values.
        var dfMap = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

        // Candidate chunks: only chunks containing at least one query token.
        var candidateChunks = new HashSet<int>();

        // Each statement is created once and re-executed with a new parameter value.
        using (var dfCmd = db.CreateCommand())
        using (var candidateCmd = db.CreateCommand())
        {
            dfCmd.CommandText = "SELECT df FROM doc_freq WHERE token = @t";
            var dfParam = dfCmd.Parameters.Add("@t", SqliteType.Text);

            candidateCmd.CommandText = "SELECT DISTINCT chunk_id FROM tokens WHERE token = @t";
            var candidateParam = candidateCmd.Parameters.Add("@t", SqliteType.Text);

            foreach (var token in queryTokens.Keys)
            {
                dfParam.Value = token;
                var result = dfCmd.ExecuteScalar();
                if (result is not null and not DBNull)
                    dfMap[token] = Convert.ToInt32(result);

                candidateParam.Value = token;
                using var reader = candidateCmd.ExecuteReader();
                while (reader.Read())
                    candidateChunks.Add(reader.GetInt32(0));
            }
        }

        if (candidateChunks.Count == 0)
            return new();

        // Score every candidate chunk by TF-IDF cosine similarity against the query.
        var scored = new List<(int ChunkId, double Score)>();
        using (var tfCmd = db.CreateCommand())
        {
            // LEFT JOIN so a token without a doc_freq row still contributes its TF (treated as df = 0).
            tfCmd.CommandText = """
                SELECT t.token, t.tf, d.df
                FROM tokens t LEFT JOIN doc_freq d ON d.token = t.token
                WHERE t.chunk_id = @cid
                """;
            var chunkParam = tfCmd.Parameters.Add("@cid", SqliteType.Integer);

            foreach (var chunkId in candidateChunks)
            {
                var docTf = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

                chunkParam.Value = chunkId;
                using (var reader = tfCmd.ExecuteReader())
                {
                    while (reader.Read())
                    {
                        var token = reader.GetString(0);
                        docTf[token] = reader.GetInt32(1);
                        if (!reader.IsDBNull(2))
                            dfMap[token] = reader.GetInt32(2);
                    }
                }

                var score = ComputeTfIdfSimilarity(queryTokens, docTf, dfMap);
                if (score > 0.01)
                    scored.Add((chunkId, score));
            }
        }

        // Keep only the best-scoring chunks.
        var topChunks = scored
            .OrderByDescending(s => s.Score)
            .Take(maxResults)
            .ToList();

        var results = new List<SearchResult>(topChunks.Count);
        using (var resultCmd = db.CreateCommand())
        {
            resultCmd.CommandText = """
                SELECT f.path, c.start_line, c.end_line, c.content
                FROM chunks c JOIN files f ON c.file_id = f.id
                WHERE c.id = @cid
                """;
            var idParam = resultCmd.Parameters.Add("@cid", SqliteType.Integer);

            foreach (var (chunkId, score) in topChunks)
            {
                idParam.Value = chunkId;
                using var reader = resultCmd.ExecuteReader();
                if (!reader.Read())
                    continue;

                var content = reader.GetString(3);
                results.Add(new SearchResult
                {
                    FilePath = reader.GetString(0),
                    StartLine = reader.GetInt32(1),
                    EndLine = reader.GetInt32(2),
                    Score = score,
                    // Previews are capped at 200 characters, with an ellipsis when truncated.
                    Preview = content.Length > 200 ? content[..200] + "..." : content,
                });
            }
        }

        return results;
    }

    /// <summary>
    /// Loads an existing index for <paramref name="workFolder"/> if its database file is present
    /// (used on application restart).
    /// </summary>
    /// <param name="workFolder">Workspace root. Ignored when null, empty, or not an existing directory.</param>
    public void TryLoadExisting(string workFolder)
    {
        if (string.IsNullOrEmpty(workFolder) || !Directory.Exists(workFolder))
            return;

        var dbPath = GetDbPath(workFolder);
        if (!File.Exists(dbPath))
            return;

        EnsureDb(workFolder);
        _totalDocs = GetTotalChunkCount();

        // An index that contains no chunks is treated as "not indexed".
        _indexed = _totalDocs > 0;
        if (_indexed)
            LogService.Info($"기존 코드 인덱스 로드: {_totalDocs}개 청크 [{workFolder}]");
    }

    // ── TF-IDF computation ───────────────────────────────────────────────

    /// <summary>
    /// Computes the cosine similarity between the TF-IDF vectors of a query and a document chunk.
    /// </summary>
    /// <param name="queryTf">Term frequencies of the query.</param>
    /// <param name="docTf">Term frequencies of the chunk.</param>
    /// <param name="dfMap">
    /// Document frequency per token. Tokens missing from this map are treated as df = 0,
    /// which gives them the largest possible IDF.
    /// </param>
    /// <returns>The similarity score, or 0 when either vector has zero norm.</returns>
    private double ComputeTfIdfSimilarity(
        Dictionary<string, int> queryTf,
        Dictionary<string, int> docTf,
        Dictionary<string, int> dfMap)
    {
        // Smoothed inverse document frequency: ln(1 + N / (1 + df)).
        double Idf(string token) =>
            Math.Log(1.0 + _totalDocs / (1.0 + dfMap.GetValueOrDefault(token, 0)));

        double dotProduct = 0, queryNorm = 0, docNorm = 0;

        foreach (var (token, qtf) in queryTf)
        {
            var idf = Idf(token);
            var qWeight = qtf * idf;
            queryNorm += qWeight * qWeight;

            // Only tokens present in both vectors contribute to the dot product.
            if (docTf.TryGetValue(token, out var dtf))
                dotProduct += qWeight * (dtf * idf);
        }

        foreach (var (token, dtf) in docTf)
        {
            var dWeight = dtf * Idf(token);
            docNorm += dWeight * dWeight;
        }

        if (queryNorm == 0 || docNorm == 0)
            return 0;

        return dotProduct / (Math.Sqrt(queryNorm) * Math.Sqrt(docNorm));
    }

    // ── Tokenization ─────────────────────────────────────────────────────

    /// <summary>
    /// Splits text into lower-cased tokens and counts their frequencies. Pieces shorter than two
    /// characters and stop words are dropped; bigrams of adjacent surviving tokens are added as
    /// "first_second" entries.
    /// </summary>
    /// <param name="text">Text to tokenize. Null or empty yields an empty map.</param>
    /// <returns>Token (or bigram) to occurrence count.</returns>
    private static Dictionary<string, int> Tokenize(string text)
    {
        var tf = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
        if (string.IsNullOrEmpty(text))
            return tf;

        // Materialized once: the sequence feeds both the unigram and the bigram pass.
        var words = Regex.Split(text, @"[^a-zA-Z0-9가-힣_]+")
            .SelectMany(SplitCamelCase)
            .Where(w => w.Length >= 2 && !StopWords.Contains(w))
            .Select(w => w.ToLowerInvariant())
            .ToList();

        foreach (var word in words)
        {
            tf.TryGetValue(word, out var count);
            tf[word] = count + 1;
        }

        // Bigrams improve phrase-search quality.
        for (int i = 0; i < words.Count - 1; i++)
        {
            var bigram = $"{words[i]}_{words[i + 1]}";
            tf.TryGetValue(bigram, out var bigramCount);
            tf[bigram] = bigramCount + 1;
        }

        return tf;
    }

    /// <summary>
    /// Splits a word before every upper-case character ("camelCase" gives "camel", "Case").
    /// Runs of consecutive upper-case letters therefore come out as single-character pieces,
    /// which <see cref="Tokenize"/> discards through its minimum-length filter.
    /// </summary>
    /// <param name="word">Word to split. Null or empty yields nothing.</param>
    private static IEnumerable<string> SplitCamelCase(string word)
    {
        if (string.IsNullOrEmpty(word))
            yield break;

        var sb = new StringBuilder();
        foreach (var ch in word)
        {
            if (char.IsUpper(ch) && sb.Length > 0)
            {
                yield return sb.ToString();
                sb.Clear();
            }
            sb.Append(ch);
        }

        if (sb.Length > 0)
            yield return sb.ToString();
    }

    // ── Dispose ──────────────────────────────────────────────────────────

    /// <summary>
    /// Disposes the database connection, if any, and clears the field so that
    /// <see cref="Search"/> returns empty results afterwards.
    /// </summary>
    public void Dispose()
    {
        _db?.Dispose();
        _db = null;
    }
}