using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Data.Sqlite;

namespace AxCopilot.Services;

/// <summary>
/// Indexes the code files of a project folder and serves similarity search over them.
/// TF-IDF data is persisted in a per-folder SQLite database so that the index can be
/// updated incrementally and reused after an application restart.
/// (Local only; no external server is required.)
/// </summary>
public class CodeIndexService : IDisposable
{
    /// <summary>Number of source lines stored per chunk.</summary>
    private const int ChunkLineCount = 50;

    /// <summary>Maximum preview length (in characters) returned by <see cref="Search"/>.</summary>
    private const int PreviewLength = 200;

    /// <summary>Fallback for the maximum indexable file size when no setting is available.</summary>
    private const int DefaultMaxFileKb = 500;

    /// <summary>Chunks scoring at or below this similarity are dropped from search results.</summary>
    private const double MinScore = 0.01;

    /// <summary>Splits text on every run of characters that cannot be part of a token.</summary>
    private static readonly Regex TokenSeparator = new(@"[^a-zA-Z0-9가-힣_]+", RegexOptions.Compiled);

    private SqliteConnection? _db;
    private string _workFolder = "";
    private bool _indexed;
    private int _totalDocs;

    /// <summary>True when the currently opened index contains at least one chunk.</summary>
    public bool IsIndexed => _indexed;

    /// <summary>Number of chunks in the currently opened index.</summary>
    public int ChunkCount => _totalDocs;

    // ── Stop words (improve TF-IDF accuracy) ────────────────────────────
    private static readonly HashSet<string> StopWords = new(StringComparer.OrdinalIgnoreCase)
    {
        // Common English words
        "the", "is", "at", "of", "on", "and", "or", "not", "in", "to", "for",
        "it", "be", "as", "do", "by", "this", "that", "with", "from", "but",
        "an", "are", "was", "were", "been", "being", "have", "has", "had",
        "if", "else", "then", "than", "so", "no", "yes",
        // Common programming words (too frequent to discriminate between chunks)
        "var", "int", "string", "void", "null", "new", "return", "get", "set",
        "public", "private", "class", "static", "using", "namespace",
        "true", "false", "import", "export", "function", "const", "let", "def", "self",
    };

    private static readonly HashSet<string> CodeExtensions = new(StringComparer.OrdinalIgnoreCase)
    {
        ".cs", ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".cpp", ".c", ".h", ".hpp",
        ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala",
        ".html", ".css", ".scss", ".json", ".xml", ".yaml", ".yml",
        ".md", ".txt", ".sql", ".sh", ".bat", ".ps1",
        ".csproj", ".sln", ".gradle", ".pom",
    };

    // ── Database initialization ─────────────────────────────────────────

    /// <summary>
    /// Returns the database path for a work folder:
    /// %APPDATA%\AxCopilot\index\{folderHash}.db, where folderHash is the first 16 hex
    /// characters of the SHA-256 of the lower-cased folder path. Creates the directory if needed.
    /// </summary>
    private string GetDbPath(string workFolder)
    {
        var hash = Convert.ToHexString(
            System.Security.Cryptography.SHA256.HashData(
                Encoding.UTF8.GetBytes(workFolder.ToLowerInvariant())))[..16];

        var dir = Path.Combine(
            Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData),
            "AxCopilot", "index");
        Directory.CreateDirectory(dir);

        return Path.Combine(dir, $"{hash}.db");
    }

    /// <summary>
    /// Opens (and if necessary creates) the index database for <paramref name="workFolder"/>.
    /// Does nothing when that folder's database is already open. When switching folders the
    /// cached index state is reset, because it described the previous database.
    /// </summary>
    private void EnsureDb(string workFolder)
    {
        if (_db != null && _workFolder == workFolder) return;

        _db?.Dispose();
        _db = null;
        _indexed = false;
        _totalDocs = 0;

        // The builder escapes the path, so unusual characters in the profile path cannot
        // be misread as extra connection string keywords.
        var connectionString = new SqliteConnectionStringBuilder
        {
            DataSource = GetDbPath(workFolder),
        }.ToString();

        // Only publish the connection to _db once it is fully initialized, so a failure
        // here cannot leave the service holding a half-configured connection.
        var connection = new SqliteConnection(connectionString);
        try
        {
            connection.Open();

            // WAL mode (better concurrent read/write performance)
            using (var pragma = connection.CreateCommand())
            {
                pragma.CommandText = "PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;";
                pragma.ExecuteNonQuery();
            }

            // Create tables and indexes
            using (var create = connection.CreateCommand())
            {
                create.CommandText = """
                    CREATE TABLE IF NOT EXISTS files (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        path TEXT NOT NULL UNIQUE,
                        last_modified TEXT NOT NULL,
                        file_size INTEGER NOT NULL
                    );
                    CREATE TABLE IF NOT EXISTS chunks (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        file_id INTEGER NOT NULL,
                        start_line INTEGER NOT NULL,
                        end_line INTEGER NOT NULL,
                        content TEXT NOT NULL,
                        FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                    );
                    CREATE TABLE IF NOT EXISTS tokens (
                        chunk_id INTEGER NOT NULL,
                        token TEXT NOT NULL,
                        tf INTEGER NOT NULL,
                        FOREIGN KEY (chunk_id) REFERENCES chunks(id) ON DELETE CASCADE
                    );
                    CREATE TABLE IF NOT EXISTS doc_freq (
                        token TEXT PRIMARY KEY,
                        df INTEGER NOT NULL
                    );
                    CREATE TABLE IF NOT EXISTS meta (
                        key TEXT PRIMARY KEY,
                        value TEXT NOT NULL
                    );
                    CREATE INDEX IF NOT EXISTS idx_tokens_chunk ON tokens(chunk_id);
                    CREATE INDEX IF NOT EXISTS idx_tokens_token ON tokens(token);
                    CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_id);
                    """;
                create.ExecuteNonQuery();
            }
        }
        catch
        {
            connection.Dispose();
            throw;
        }

        _db = connection;
        _workFolder = workFolder;
    }

    // ── Indexing ────────────────────────────────────────────────────────

    /// <summary>
    /// Indexes the code files under <paramref name="workFolder"/>. Incremental: files whose
    /// modification time and size are unchanged are skipped, and files that no longer exist
    /// are removed from the index.
    /// </summary>
    /// <param name="workFolder">Root folder to index. Nothing happens if it is empty or missing.</param>
    /// <param name="ct">
    /// When cancellation is requested, indexing of further files stops; document frequencies
    /// and metadata are still refreshed for what has been indexed so far.
    /// </param>
    public async Task IndexAsync(string workFolder, CancellationToken ct = default)
    {
        if (string.IsNullOrEmpty(workFolder) || !Directory.Exists(workFolder))
            return;

        EnsureDb(workFolder);

        await Task.Run(() =>
        {
            var existingFiles = LoadExistingFiles();

            // Read the maximum file size from settings
            var maxFileKb = DefaultMaxFileKb;
            try
            {
                var app = System.Windows.Application.Current as App;
                var cfgMax = app?.SettingsService?.Settings.Llm.Code.CodeIndexMaxFileKb ?? DefaultMaxFileKb;
                if (cfgMax > 0) maxFileKb = cfgMax;
            }
            catch
            {
                // Settings are optional here; keep the default limit if they cannot be read.
            }

            var currentFiles = ScanFiles(workFolder, maxFileKb);

            int added = 0, updated = 0, removed = 0;

            // Remove files that no longer exist
            foreach (var (path, fileId) in existingFiles)
            {
                if (!currentFiles.ContainsKey(path))
                {
                    RemoveFileFromIndex(fileId);
                    removed++;
                }
            }

            // Index new and changed files
            foreach (var (relPath, info) in currentFiles)
            {
                if (ct.IsCancellationRequested) break;

                var lastMod = info.LastWriteTimeUtc.ToString("O");
                var size = info.Length;

                if (existingFiles.TryGetValue(relPath, out var fileId))
                {
                    // Known file: re-index only if it changed
                    if (!IsFileChanged(fileId, lastMod, size))
                        continue;

                    RemoveFileFromIndex(fileId);
                    updated++;
                }
                else
                {
                    added++;
                }

                IndexFile(workFolder, relPath, lastMod, size);
            }

            // Recompute the document-frequency table
            RebuildDocFreq();

            // Cache the total chunk count
            _totalDocs = GetTotalChunkCount();
            _indexed = _totalDocs > 0;

            // Persist metadata
            SaveMeta("lastIndexed", DateTime.UtcNow.ToString("O"));
            SaveMeta("workFolder", workFolder);

            LogService.Info($"코드 인덱싱 완료: {_totalDocs}개 청크 (추가:{added} 갱신:{updated} 삭제:{removed}) [{workFolder}]");
        }, ct);
    }

    /// <summary>Loads the indexed files as a map of relative path to file id.</summary>
    private Dictionary<string, int> LoadExistingFiles()
    {
        var dict = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
        using var cmd = _db!.CreateCommand();
        cmd.CommandText = "SELECT id, path FROM files";
        using var reader = cmd.ExecuteReader();
        while (reader.Read())
            dict[reader.GetString(1)] = reader.GetInt32(0);
        return dict;
    }

    /// <summary>
    /// Enumerates files under <paramref name="workFolder"/> (up to 8 directory levels deep)
    /// whose extension is in <see cref="CodeExtensions"/> and whose size does not exceed
    /// <paramref name="maxFileKb"/> KiB. Keys are paths relative to the work folder.
    /// </summary>
    private Dictionary<string, FileInfo> ScanFiles(string workFolder, int maxFileKb = 500)
    {
        var dict = new Dictionary<string, FileInfo>(StringComparer.OrdinalIgnoreCase);
        var maxBytes = (long)maxFileKb * 1024;
        try
        {
            var files = Directory.EnumerateFiles(workFolder, "*.*", new EnumerationOptions
            {
                RecurseSubdirectories = true,
                IgnoreInaccessible = true,
                MaxRecursionDepth = 8,
            });

            foreach (var file in files)
            {
                var ext = Path.GetExtension(file);
                if (!CodeExtensions.Contains(ext)) continue;

                try
                {
                    var info = new FileInfo(file);
                    if (info.Length > maxBytes) continue;

                    var relPath = Path.GetRelativePath(workFolder, file);
                    dict[relPath] = info;
                }
                catch
                {
                    // Best effort: a file that cannot be inspected is simply not indexed.
                }
            }
        }
        catch
        {
            // Best effort: if enumeration fails part-way, return what was collected so far.
            // NOTE(review): the caller treats files missing from this result as deleted.
        }
        return dict;
    }

    /// <summary>
    /// Returns true when the stored modification time or size of the file differs from the
    /// given values, or when the file row no longer exists.
    /// </summary>
    private bool IsFileChanged(int fileId, string lastMod, long size)
    {
        using var cmd = _db!.CreateCommand();
        cmd.CommandText = "SELECT last_modified, file_size FROM files WHERE id = @id";
        cmd.Parameters.AddWithValue("@id", fileId);
        using var reader = cmd.ExecuteReader();
        if (!reader.Read()) return true;
        return reader.GetString(0) != lastMod || reader.GetInt64(1) != size;
    }

    /// <summary>
    /// Deletes a file together with its chunks and tokens. All three deletes run in one
    /// transaction so an interrupted removal cannot leave orphaned rows behind.
    /// </summary>
    private void RemoveFileFromIndex(int fileId)
    {
        using var tx = _db!.BeginTransaction();

        // Children first (tokens → chunks → file) so the deletes work regardless of
        // whether foreign-key cascading is active on this connection.
        ExecuteDelete("DELETE FROM tokens WHERE chunk_id IN (SELECT id FROM chunks WHERE file_id = @fid)");
        ExecuteDelete("DELETE FROM chunks WHERE file_id = @fid");
        ExecuteDelete("DELETE FROM files WHERE id = @fid");

        tx.Commit();

        void ExecuteDelete(string sql)
        {
            using var cmd = _db!.CreateCommand();
            cmd.Transaction = tx;
            cmd.CommandText = sql;
            cmd.Parameters.AddWithValue("@fid", fileId);
            cmd.ExecuteNonQuery();
        }
    }

    /// <summary>
    /// Reads one file, splits it into chunks of <see cref="ChunkLineCount"/> lines and stores
    /// the file row, its chunks and their token frequencies. Everything is written in a single
    /// transaction: if anything fails, no file row is left behind, so the file is retried on
    /// the next indexing run instead of being considered up to date. Failures are logged and
    /// the file is skipped.
    /// </summary>
    private void IndexFile(string workFolder, string relPath, string lastMod, long size)
    {
        try
        {
            var fullPath = Path.Combine(workFolder, relPath);
            var content = File.ReadAllText(fullPath, Encoding.UTF8);
            var lines = content.Split('\n');

            using var tx = _db!.BeginTransaction();

            // Register the file
            int fileId;
            using (var fileCmd = _db.CreateCommand())
            {
                fileCmd.Transaction = tx;
                fileCmd.CommandText = "INSERT INTO files (path, last_modified, file_size) VALUES (@p, @m, @s) RETURNING id";
                fileCmd.Parameters.AddWithValue("@p", relPath);
                fileCmd.Parameters.AddWithValue("@m", lastMod);
                fileCmd.Parameters.AddWithValue("@s", size);
                fileId = Convert.ToInt32(fileCmd.ExecuteScalar());
            }

            // The insert commands are created once and re-executed with new parameter
            // values, instead of building a new command for every chunk and token.
            using var chunkCmd = _db.CreateCommand();
            chunkCmd.Transaction = tx;
            chunkCmd.CommandText = "INSERT INTO chunks (file_id, start_line, end_line, content) VALUES (@f, @s, @e, @c) RETURNING id";
            chunkCmd.Parameters.AddWithValue("@f", fileId);
            var chunkStart = AddParameter(chunkCmd, "@s");
            var chunkEnd = AddParameter(chunkCmd, "@e");
            var chunkContent = AddParameter(chunkCmd, "@c");

            using var tokenCmd = _db.CreateCommand();
            tokenCmd.Transaction = tx;
            tokenCmd.CommandText = "INSERT INTO tokens (chunk_id, token, tf) VALUES (@cid, @t, @tf)";
            var tokenChunkId = AddParameter(tokenCmd, "@cid");
            var tokenText = AddParameter(tokenCmd, "@t");
            var tokenTf = AddParameter(tokenCmd, "@tf");

            // Split into chunks
            for (int i = 0; i < lines.Length; i += ChunkLineCount)
            {
                var count = Math.Min(ChunkLineCount, lines.Length - i);
                var chunkText = string.Join("\n", lines, i, count);
                if (string.IsNullOrWhiteSpace(chunkText)) continue;

                // Store the chunk (line numbers are 1-based and inclusive)
                chunkStart.Value = i + 1;
                chunkEnd.Value = i + count;
                chunkContent.Value = chunkText;
                var chunkId = Convert.ToInt32(chunkCmd.ExecuteScalar());

                // Store the tokens
                tokenChunkId.Value = chunkId;
                foreach (var (token, tf) in Tokenize(chunkText))
                {
                    tokenText.Value = token;
                    tokenTf.Value = tf;
                    tokenCmd.ExecuteNonQuery();
                }
            }

            tx.Commit();
        }
        catch (Exception ex)
        {
            // Skip files that cannot be read or stored; the uncommitted transaction is
            // rolled back when it is disposed.
            LogService.Info($"코드 인덱싱 건너뜀: {relPath} ({ex.Message})");
        }
    }

    /// <summary>Adds a named parameter to a command and returns it so its value can be set per execution.</summary>
    private static SqliteParameter AddParameter(SqliteCommand cmd, string name)
    {
        var parameter = cmd.CreateParameter();
        parameter.ParameterName = name;
        cmd.Parameters.Add(parameter);
        return parameter;
    }

    /// <summary>
    /// Recomputes the doc_freq table (number of distinct chunks containing each token).
    /// Runs in a transaction so the table is never observed empty or half-filled.
    /// </summary>
    private void RebuildDocFreq()
    {
        using var tx = _db!.BeginTransaction();
        using var cmd = _db.CreateCommand();
        cmd.Transaction = tx;
        cmd.CommandText = """
            DELETE FROM doc_freq;
            INSERT INTO doc_freq (token, df)
            SELECT token, COUNT(DISTINCT chunk_id) FROM tokens GROUP BY token;
            """;
        cmd.ExecuteNonQuery();
        tx.Commit();
    }

    /// <summary>Returns the number of rows in the chunks table.</summary>
    private int GetTotalChunkCount()
    {
        using var cmd = _db!.CreateCommand();
        cmd.CommandText = "SELECT COUNT(*) FROM chunks";
        return Convert.ToInt32(cmd.ExecuteScalar());
    }

    /// <summary>Inserts or replaces one key/value pair in the meta table.</summary>
    private void SaveMeta(string key, string value)
    {
        using var cmd = _db!.CreateCommand();
        cmd.CommandText = "INSERT OR REPLACE INTO meta (key, value) VALUES (@k, @v)";
        cmd.Parameters.AddWithValue("@k", key);
        cmd.Parameters.AddWithValue("@v", value);
        cmd.ExecuteNonQuery();
    }

    // ── Search ──────────────────────────────────────────────────────────

    /// <summary>
    /// Returns the chunks most similar to <paramref name="query"/>, ranked by TF-IDF cosine
    /// similarity in descending order.
    /// </summary>
    /// <param name="query">Free-text question or keywords.</param>
    /// <param name="maxResults">Maximum number of results to return.</param>
    /// <returns>
    /// Matching chunks with a preview of at most 200 characters; an empty list when no index
    /// is loaded, the query is blank or yields no tokens, or nothing scores above the threshold.
    /// </returns>
    public List<SearchResult> Search(string query, int maxResults = 5)
    {
        if (!_indexed || _db == null || _totalDocs == 0 || string.IsNullOrWhiteSpace(query))
            return new();

        var queryTokens = Tokenize(query);
        if (queryTokens.Count == 0) return new();

        // Look up the DF of every query token and collect the candidate chunks:
        // only chunks that contain at least one query token can score above zero.
        var dfMap = new Dictionary<string, int>();
        var candidateChunks = new HashSet<int>();

        using (var dfCmd = _db.CreateCommand())
        using (var candidateCmd = _db.CreateCommand())
        {
            dfCmd.CommandText = "SELECT df FROM doc_freq WHERE token = @t";
            var dfToken = AddParameter(dfCmd, "@t");

            candidateCmd.CommandText = "SELECT DISTINCT chunk_id FROM tokens WHERE token = @t";
            var candidateToken = AddParameter(candidateCmd, "@t");

            foreach (var token in queryTokens.Keys)
            {
                dfToken.Value = token;
                var df = dfCmd.ExecuteScalar();
                if (df != null) dfMap[token] = Convert.ToInt32(df);

                candidateToken.Value = token;
                using var reader = candidateCmd.ExecuteReader();
                while (reader.Read())
                    candidateChunks.Add(reader.GetInt32(0));
            }
        }

        if (candidateChunks.Count == 0) return new();

        // Compute the TF-IDF similarity of every candidate chunk
        var scored = new List<(int ChunkId, double Score)>();
        using (var chunkTokensCmd = _db.CreateCommand())
        {
            // The join also fetches the DF of each chunk token. Without it, tokens that are
            // not part of the query would be weighted as if they occurred in no document
            // when the document norm is computed.
            chunkTokensCmd.CommandText = """
                SELECT t.token, t.tf, d.df
                FROM tokens t LEFT JOIN doc_freq d ON d.token = t.token
                WHERE t.chunk_id = @cid
                """;
            var chunkParam = AddParameter(chunkTokensCmd, "@cid");

            foreach (var chunkId in candidateChunks)
            {
                // Load the chunk's token frequencies
                var docTf = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
                chunkParam.Value = chunkId;
                using (var reader = chunkTokensCmd.ExecuteReader())
                {
                    while (reader.Read())
                    {
                        var token = reader.GetString(0);
                        docTf[token] = reader.GetInt32(1);
                        if (!reader.IsDBNull(2))
                            dfMap[token] = reader.GetInt32(2);
                    }
                }

                var score = ComputeTfIdfSimilarity(queryTokens, docTf, dfMap);
                if (score > MinScore)
                    scored.Add((chunkId, score));
            }
        }

        // Take the best results
        var topChunks = scored
            .OrderByDescending(entry => entry.Score)
            .Take(maxResults)
            .ToList();

        var results = new List<SearchResult>();
        foreach (var (chunkId, score) in topChunks)
        {
            using var cmd = _db.CreateCommand();
            cmd.CommandText = """
                SELECT f.path, c.start_line, c.end_line, c.content
                FROM chunks c JOIN files f ON c.file_id = f.id
                WHERE c.id = @cid
                """;
            cmd.Parameters.AddWithValue("@cid", chunkId);
            using var reader = cmd.ExecuteReader();
            if (reader.Read())
            {
                var content = reader.GetString(3);
                results.Add(new SearchResult
                {
                    FilePath = reader.GetString(0),
                    StartLine = reader.GetInt32(1),
                    EndLine = reader.GetInt32(2),
                    Score = score,
                    Preview = content.Length > PreviewLength
                        ? content[..PreviewLength] + "..."
                        : content,
                });
            }
        }

        return results;
    }

    /// <summary>
    /// Loads an existing index for <paramref name="workFolder"/> if its database file exists
    /// (used on application restart). Does nothing when the folder or the database is missing.
    /// </summary>
    public void TryLoadExisting(string workFolder)
    {
        if (string.IsNullOrEmpty(workFolder) || !Directory.Exists(workFolder)) return;

        var dbPath = GetDbPath(workFolder);
        if (!File.Exists(dbPath)) return;

        EnsureDb(workFolder);
        _totalDocs = GetTotalChunkCount();
        _indexed = _totalDocs > 0;

        if (_indexed)
            LogService.Info($"기존 코드 인덱스 로드: {_totalDocs}개 청크 [{workFolder}]");
    }

    // ── TF-IDF computation ──────────────────────────────────────────────

    /// <summary>
    /// Cosine similarity between the TF-IDF vectors of a query and a document.
    /// </summary>
    /// <param name="queryTf">Token frequencies of the query.</param>
    /// <param name="docTf">Token frequencies of the document (chunk).</param>
    /// <param name="dfMap">Document frequency per token; tokens that are absent count as df = 0.</param>
    /// <returns>The similarity, or 0 when either vector has zero length.</returns>
    private double ComputeTfIdfSimilarity(
        Dictionary<string, int> queryTf,
        Dictionary<string, int> docTf,
        Dictionary<string, int> dfMap)
    {
        double dotProduct = 0, queryNorm = 0, docNorm = 0;

        foreach (var (token, qtf) in queryTf)
        {
            var idf = Idf(token);
            var qWeight = qtf * idf;
            queryNorm += qWeight * qWeight;

            if (docTf.TryGetValue(token, out var dtf))
            {
                var dWeight = dtf * idf;
                dotProduct += qWeight * dWeight;
            }
        }

        foreach (var (token, dtf) in docTf)
        {
            var dWeight = dtf * Idf(token);
            docNorm += dWeight * dWeight;
        }

        if (queryNorm == 0 || docNorm == 0) return 0;
        return dotProduct / (Math.Sqrt(queryNorm) * Math.Sqrt(docNorm));

        // Smoothed inverse document frequency: log(1 + N / (1 + df)).
        double Idf(string token)
        {
            var df = dfMap.GetValueOrDefault(token, 0);
            return Math.Log(1.0 + _totalDocs / (1.0 + df));
        }
    }

    // ── Tokenization ────────────────────────────────────────────────────

    /// <summary>
    /// Splits text into lower-cased tokens and counts their frequency. Identifiers are split
    /// on camel-case boundaries; tokens shorter than two characters and stop words are dropped.
    /// Adjacent token pairs are additionally counted as "first_second" bigrams to improve
    /// phrase matching.
    /// </summary>
    private static Dictionary<string, int> Tokenize(string text)
    {
        var tf = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

        // Materialized once: the list is walked twice (unigrams, then bigrams).
        var words = TokenSeparator.Split(text)
            .SelectMany(SplitCamelCase)
            .Where(w => w.Length >= 2 && !StopWords.Contains(w))
            .Select(w => w.ToLowerInvariant())
            .ToList();

        foreach (var word in words)
        {
            tf.TryGetValue(word, out var count);
            tf[word] = count + 1;
        }

        // Add bigrams
        for (int i = 0; i < words.Count - 1; i++)
        {
            var bigram = $"{words[i]}_{words[i + 1]}";
            tf.TryGetValue(bigram, out var bigramCount);
            tf[bigram] = bigramCount + 1;
        }

        return tf;
    }

    /// <summary>
    /// Splits a word before every upper-case character ("camelCase" → "camel", "Case").
    /// Yields nothing for a null or empty word.
    /// </summary>
    private static IEnumerable<string> SplitCamelCase(string word)
    {
        if (string.IsNullOrEmpty(word)) yield break;

        var sb = new StringBuilder();
        foreach (var ch in word)
        {
            if (char.IsUpper(ch) && sb.Length > 0)
            {
                yield return sb.ToString();
                sb.Clear();
            }
            sb.Append(ch);
        }
        if (sb.Length > 0) yield return sb.ToString();
    }

    // ── Dispose ─────────────────────────────────────────────────────────

    /// <summary>Closes the index database connection, if one is open.</summary>
    public void Dispose()
    {
        _db?.Dispose();
        _db = null;
        GC.SuppressFinalize(this);
    }
}

/// <summary>An indexed code chunk.</summary>
public class CodeChunk
{
    public string FilePath { get; init; } = "";
    public int StartLine { get; init; }
    public int EndLine { get; init; }
    public string Content { get; init; } = "";
    public Dictionary<string, int> Tokens { get; init; } = new();
}

/// <summary>A search result.</summary>
public class SearchResult
{
    public string FilePath { get; init; } = "";
    public int StartLine { get; init; }
    public int EndLine { get; init; }
    public double Score { get; init; }
    public string Preview { get; init; } = "";

    public override string ToString() => $"{FilePath}:{StartLine}-{EndLine} (score: {Score:F3})";
}