Files
AX-Copilot-Codex/.decompiledproj/AxCopilot/Services/CodeIndexService.cs

517 lines
17 KiB
C#

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows;
using Microsoft.Data.Sqlite;
namespace AxCopilot.Services;
public class CodeIndexService : IDisposable
{
// Connection to the SQLite index database; null until EnsureDb opens it and again after Dispose.
private SqliteConnection? _db;
// Work folder that _db was opened for; EnsureDb compares against it to decide whether to reconnect.
private string _workFolder = "";
// Set to (_totalDocs > 0) whenever IndexAsync or TryLoadExisting refreshes the chunk count.
private bool _indexed;
// Chunk row count from the last refresh; also the document count in the IDF term of ComputeTfIdfSimilarity.
private int _totalDocs;
// Words that Tokenize discards (matched case-insensitively): common English words and frequent programming keywords.
private static readonly HashSet<string> StopWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
"the", "is", "at", "of", "on", "and", "or", "not", "in", "to",
"for", "it", "be", "as", "do", "by", "this", "that", "with", "from",
"but", "an", "are", "was", "were", "been", "being", "have", "has", "had",
"if", "else", "then", "than", "so", "no", "yes", "var", "int", "string",
"void", "null", "new", "return", "get", "set", "public", "private", "class", "static",
"using", "namespace", "true", "false", "import", "export", "function", "const", "let", "def",
"self"
};
// File extensions (matched case-insensitively) that ScanFiles accepts for indexing.
private static readonly HashSet<string> CodeExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
".cs", ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".cpp", ".c", ".h",
".hpp", ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala", ".html", ".css",
".scss", ".json", ".xml", ".yaml", ".yml", ".md", ".txt", ".sql", ".sh", ".bat",
".ps1", ".csproj", ".sln", ".gradle", ".pom"
};
// True when the most recently counted index held at least one chunk.
public bool IsIndexed => _indexed;
// Number of chunks counted at the last refresh.
public int ChunkCount => _totalDocs;
/// <summary>
/// Maps a work folder to the path of its index database under
/// <c>%AppData%/AxCopilot/index</c>, creating that directory if needed.
/// The file name is the first 16 hex characters of the SHA-256 hash of the
/// lower-cased folder path, followed by <c>.db</c>.
/// </summary>
/// <param name="workFolder">Folder whose index database path is wanted.</param>
/// <returns>Full path of the database file (the file itself is not created here).</returns>
private string GetDbPath(string workFolder)
{
    byte[] folderBytes = Encoding.UTF8.GetBytes(workFolder.ToLowerInvariant());
    string fullHash = Convert.ToHexString(SHA256.HashData(folderBytes));

    string indexDirectory = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData),
        "AxCopilot",
        "index");
    Directory.CreateDirectory(indexDirectory);

    string fileName = fullHash.Substring(0, 16) + ".db";
    return Path.Combine(indexDirectory, fileName);
}
/// <summary>
/// Makes <c>_db</c> an open connection to the index database of
/// <paramref name="workFolder"/> and creates the schema if it does not exist yet.
/// Does nothing when a connection for the same folder is already held.
/// </summary>
/// <param name="workFolder">Folder whose index database should be open.</param>
/// <remarks>
/// The new connection is fully opened and initialised before any field is touched.
/// If opening or schema creation throws, the previous connection and folder stay in
/// place, so the early-return check above never accepts a half-initialised connection.
/// </remarks>
private void EnsureDb(string workFolder)
{
    if (_db != null && _workFolder == workFolder)
    {
        return;
    }

    // Let the builder quote the path so characters such as ';' or '=' cannot corrupt the connection string.
    string connectionString = new SqliteConnectionStringBuilder
    {
        DataSource = GetDbPath(workFolder)
    }.ConnectionString;

    SqliteConnection connection = new SqliteConnection(connectionString);
    try
    {
        connection.Open();

        using (SqliteCommand pragmaCommand = connection.CreateCommand())
        {
            pragmaCommand.CommandText = "PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;";
            pragmaCommand.ExecuteNonQuery();
        }

        using (SqliteCommand schemaCommand = connection.CreateCommand())
        {
            schemaCommand.CommandText =
                " CREATE TABLE IF NOT EXISTS files (\n" +
                " id INTEGER PRIMARY KEY AUTOINCREMENT,\n" +
                " path TEXT NOT NULL UNIQUE,\n" +
                " last_modified TEXT NOT NULL,\n" +
                " file_size INTEGER NOT NULL\n" +
                " );\n" +
                " CREATE TABLE IF NOT EXISTS chunks (\n" +
                " id INTEGER PRIMARY KEY AUTOINCREMENT,\n" +
                " file_id INTEGER NOT NULL,\n" +
                " start_line INTEGER NOT NULL,\n" +
                " end_line INTEGER NOT NULL,\n" +
                " content TEXT NOT NULL,\n" +
                " FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE\n" +
                " );\n" +
                " CREATE TABLE IF NOT EXISTS tokens (\n" +
                " chunk_id INTEGER NOT NULL,\n" +
                " token TEXT NOT NULL,\n" +
                " tf INTEGER NOT NULL,\n" +
                " FOREIGN KEY (chunk_id) REFERENCES chunks(id) ON DELETE CASCADE\n" +
                " );\n" +
                " CREATE TABLE IF NOT EXISTS doc_freq (\n" +
                " token TEXT PRIMARY KEY,\n" +
                " df INTEGER NOT NULL\n" +
                " );\n" +
                " CREATE TABLE IF NOT EXISTS meta (\n" +
                " key TEXT PRIMARY KEY,\n" +
                " value TEXT NOT NULL\n" +
                " );\n" +
                " CREATE INDEX IF NOT EXISTS idx_tokens_chunk ON tokens(chunk_id);\n" +
                " CREATE INDEX IF NOT EXISTS idx_tokens_token ON tokens(token);\n" +
                " CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_id);";
            schemaCommand.ExecuteNonQuery();
        }
    }
    catch
    {
        // Do not leak the failed connection; the caller still sees the original exception.
        connection.Dispose();
        throw;
    }

    // Swap only after the new connection is known to be usable.
    _db?.Dispose();
    _db = connection;
    _workFolder = workFolder;
}
/// <summary>
/// Incrementally brings the index of <paramref name="workFolder"/> up to date on a
/// thread-pool task: drops files that are no longer found by the scan, re-indexes
/// files whose timestamp or size changed, indexes new files, then rebuilds the
/// document-frequency table and refreshes the chunk count.
/// </summary>
/// <param name="workFolder">Root folder to index; the call is a no-op if it is null, empty or missing.</param>
/// <param name="ct">
/// Checked before each file is (re)indexed. When cancellation is requested the loop stops,
/// but the document-frequency rebuild, metadata save and log line still run.
/// </param>
public async Task IndexAsync(string workFolder, CancellationToken ct = default(CancellationToken))
{
    bool folderUsable = !string.IsNullOrEmpty(workFolder) && Directory.Exists(workFolder);
    if (!folderUsable)
    {
        return;
    }

    EnsureDb(workFolder);

    await Task.Run(() =>
    {
        Dictionary<string, int> indexedFiles = LoadExistingFiles();

        // The size limit comes from application settings when available; any failure keeps the 500 KB default.
        int maxFileKb = 500;
        try
        {
            int configuredKb = (Application.Current as App)?.SettingsService?.Settings.Llm.Code.CodeIndexMaxFileKb ?? 500;
            if (configuredKb > 0)
            {
                maxFileKb = configuredKb;
            }
        }
        catch
        {
        }

        Dictionary<string, FileInfo> diskFiles = ScanFiles(workFolder, maxFileKb);
        int addedCount = 0;
        int updatedCount = 0;
        int removedCount = 0;

        // Pass 1: forget files that the scan no longer reports.
        foreach (KeyValuePair<string, int> indexed in indexedFiles)
        {
            if (diskFiles.ContainsKey(indexed.Key))
            {
                continue;
            }
            RemoveFileFromIndex(indexed.Value);
            removedCount++;
        }

        // Pass 2: index new files and re-index changed ones.
        foreach (KeyValuePair<string, FileInfo> scanned in diskFiles)
        {
            if (ct.IsCancellationRequested)
            {
                break;
            }

            string relativePath = scanned.Key;
            FileInfo info = scanned.Value;
            string lastMod = info.LastWriteTimeUtc.ToString("O");
            long size = info.Length;

            bool alreadyIndexed = indexedFiles.TryGetValue(relativePath, out int existingId);
            if (alreadyIndexed && !IsFileChanged(existingId, lastMod, size))
            {
                continue;
            }

            if (alreadyIndexed)
            {
                RemoveFileFromIndex(existingId);
                updatedCount++;
            }
            else
            {
                addedCount++;
            }

            IndexFile(workFolder, relativePath, lastMod, size);
        }

        RebuildDocFreq();
        _totalDocs = GetTotalChunkCount();
        _indexed = _totalDocs > 0;
        SaveMeta("lastIndexed", DateTime.UtcNow.ToString("O"));
        SaveMeta("workFolder", workFolder);
        LogService.Info($"코드 인덱싱 완료: {_totalDocs}개 청크 (추가:{addedCount} 갱신:{updatedCount} 삭제:{removedCount}) [{workFolder}]");
    }, ct);
}
/// <summary>
/// Reads every row of the <c>files</c> table.
/// </summary>
/// <returns>Map from stored path (case-insensitive key) to file id.</returns>
private Dictionary<string, int> LoadExistingFiles()
{
    var idsByPath = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

    using (SqliteCommand query = _db.CreateCommand())
    {
        query.CommandText = "SELECT id, path FROM files";
        using (SqliteDataReader rows = query.ExecuteReader())
        {
            while (rows.Read())
            {
                string path = rows.GetString(1);
                int fileId = rows.GetInt32(0);
                idsByPath[path] = fileId;
            }
        }
    }

    return idsByPath;
}
/// <summary>
/// Walks <paramref name="workFolder"/> (recursion depth limited to 8, inaccessible entries
/// ignored) and collects files whose extension is in <c>CodeExtensions</c> and whose size
/// does not exceed <paramref name="maxFileKb"/> kilobytes.
/// </summary>
/// <param name="workFolder">Root folder to enumerate.</param>
/// <param name="maxFileKb">Largest accepted file size in KB.</param>
/// <returns>
/// Map from path relative to <paramref name="workFolder"/> (case-insensitive key) to file info.
/// Errors are swallowed: a failing file is skipped, and a failing enumeration returns
/// whatever was collected up to that point.
/// </returns>
private Dictionary<string, FileInfo> ScanFiles(string workFolder, int maxFileKb = 500)
{
    var found = new Dictionary<string, FileInfo>(StringComparer.OrdinalIgnoreCase);
    long maxBytes = (long)maxFileKb * 1024L;

    try
    {
        var options = new EnumerationOptions
        {
            RecurseSubdirectories = true,
            IgnoreInaccessible = true,
            MaxRecursionDepth = 8
        };

        foreach (string fullPath in Directory.EnumerateFiles(workFolder, "*.*", options))
        {
            bool isCodeFile = CodeExtensions.Contains(Path.GetExtension(fullPath));
            if (!isCodeFile)
            {
                continue;
            }

            try
            {
                var info = new FileInfo(fullPath);
                if (info.Length > maxBytes)
                {
                    continue;
                }
                found[Path.GetRelativePath(workFolder, fullPath)] = info;
            }
            catch
            {
            }
        }
    }
    catch
    {
    }

    return found;
}
/// <summary>
/// Compares the stored timestamp and size of an indexed file with the current values.
/// </summary>
/// <param name="fileId">Id of the row in <c>files</c>.</param>
/// <param name="lastMod">Current last-modified value in the same string form that was stored.</param>
/// <param name="size">Current file size in bytes.</param>
/// <returns>
/// <c>true</c> if no row exists for <paramref name="fileId"/> or either stored value differs;
/// otherwise <c>false</c>.
/// </returns>
private bool IsFileChanged(int fileId, string lastMod, long size)
{
    using (SqliteCommand lookup = _db.CreateCommand())
    {
        lookup.CommandText = "SELECT last_modified, file_size FROM files WHERE id = @id";
        lookup.Parameters.AddWithValue("@id", fileId);

        using (SqliteDataReader row = lookup.ExecuteReader())
        {
            if (row.Read())
            {
                bool unchanged = row.GetString(0) == lastMod && row.GetInt64(1) == size;
                return !unchanged;
            }

            return true;
        }
    }
}
/// <summary>
/// Deletes a file and everything derived from it (its tokens, its chunks, the
/// <c>files</c> row) from the index.
/// </summary>
/// <param name="fileId">Id of the row in <c>files</c> to remove.</param>
/// <remarks>
/// All three deletes run in one transaction, so a failure cannot leave a file with
/// only part of its rows removed. Tokens are removed with a single subquery delete
/// instead of one statement per chunk.
/// </remarks>
private void RemoveFileFromIndex(int fileId)
{
    using SqliteTransaction transaction = _db.BeginTransaction();

    // Children first: tokens depend on chunks, and chunks depend on the file row.
    ExecuteDelete("DELETE FROM tokens WHERE chunk_id IN (SELECT id FROM chunks WHERE file_id = @fid)");
    ExecuteDelete("DELETE FROM chunks WHERE file_id = @fid");
    ExecuteDelete("DELETE FROM files WHERE id = @fid");

    transaction.Commit();

    // Runs one parameterised DELETE inside the surrounding transaction.
    void ExecuteDelete(string sql)
    {
        using SqliteCommand command = _db.CreateCommand();
        command.Transaction = transaction;
        command.CommandText = sql;
        command.Parameters.AddWithValue("@fid", fileId);
        command.ExecuteNonQuery();
    }
}
/// <summary>
/// Reads one file as UTF-8, splits it into chunks of 50 lines, and stores the file row,
/// the non-blank chunks and each chunk's token frequencies.
/// </summary>
/// <param name="workFolder">Root folder that <paramref name="relPath"/> is relative to.</param>
/// <param name="relPath">Path of the file relative to <paramref name="workFolder"/>; stored as the file's key.</param>
/// <param name="lastMod">Last-modified value to record for later change detection.</param>
/// <param name="size">File size in bytes to record for later change detection.</param>
/// <remarks>
/// The file row, chunks and tokens are written in a single transaction. If anything fails,
/// the whole file is rolled back, so no <c>files</c> row is left behind without chunks
/// (which would make the file look unchanged and never be re-indexed).
/// Indexing stays best-effort: any exception is swallowed and the file is skipped.
/// </remarks>
private void IndexFile(string workFolder, string relPath, string lastMod, long size)
{
    const int LinesPerChunk = 50;

    try
    {
        string content = File.ReadAllText(Path.Combine(workFolder, relPath), Encoding.UTF8);
        string[] lines = content.Split('\n');

        // Disposed without Commit on any exception, which rolls back every insert below.
        using SqliteTransaction transaction = _db.BeginTransaction();

        int fileId;
        using (SqliteCommand insertFile = _db.CreateCommand())
        {
            insertFile.Transaction = transaction;
            insertFile.CommandText = "INSERT INTO files (path, last_modified, file_size) VALUES (@p, @m, @s) RETURNING id";
            insertFile.Parameters.AddWithValue("@p", relPath);
            insertFile.Parameters.AddWithValue("@m", lastMod);
            insertFile.Parameters.AddWithValue("@s", size);
            fileId = Convert.ToInt32(insertFile.ExecuteScalar());
        }

        // Both insert commands are created once and re-executed with new parameter values.
        using SqliteCommand insertChunk = _db.CreateCommand();
        insertChunk.Transaction = transaction;
        insertChunk.CommandText = "INSERT INTO chunks (file_id, start_line, end_line, content) VALUES (@f, @s, @e, @c) RETURNING id";
        insertChunk.Parameters.AddWithValue("@f", fileId);
        SqliteParameter startLineParam = insertChunk.Parameters.Add("@s", SqliteType.Integer);
        SqliteParameter endLineParam = insertChunk.Parameters.Add("@e", SqliteType.Integer);
        SqliteParameter contentParam = insertChunk.Parameters.Add("@c", SqliteType.Text);

        using SqliteCommand insertToken = _db.CreateCommand();
        insertToken.Transaction = transaction;
        insertToken.CommandText = "INSERT INTO tokens (chunk_id, token, tf) VALUES (@cid, @t, @tf)";
        SqliteParameter chunkIdParam = insertToken.Parameters.Add("@cid", SqliteType.Integer);
        SqliteParameter tokenParam = insertToken.Parameters.Add("@t", SqliteType.Text);
        SqliteParameter tfParam = insertToken.Parameters.Add("@tf", SqliteType.Integer);

        for (int start = 0; start < lines.Length; start += LinesPerChunk)
        {
            int lineCount = Math.Min(LinesPerChunk, lines.Length - start);
            string chunkText = string.Join("\n", lines, start, lineCount);
            if (string.IsNullOrWhiteSpace(chunkText))
            {
                continue;
            }

            // Line numbers are stored 1-based and inclusive.
            startLineParam.Value = start + 1;
            endLineParam.Value = start + lineCount;
            contentParam.Value = chunkText;
            int chunkId = Convert.ToInt32(insertChunk.ExecuteScalar());

            chunkIdParam.Value = chunkId;
            foreach (KeyValuePair<string, int> entry in Tokenize(chunkText))
            {
                tokenParam.Value = entry.Key;
                tfParam.Value = entry.Value;
                insertToken.ExecuteNonQuery();
            }
        }

        transaction.Commit();
    }
    catch
    {
        // Deliberately best-effort: an unreadable or failing file must not abort the whole indexing run.
    }
}
/// <summary>
/// Recomputes the <c>doc_freq</c> table from scratch: clears it, then stores for every
/// token the number of distinct chunks that contain it.
/// </summary>
private void RebuildDocFreq()
{
    const string RebuildSql = " DELETE FROM doc_freq;\n INSERT INTO doc_freq (token, df)\n SELECT token, COUNT(DISTINCT chunk_id) FROM tokens GROUP BY token;";

    using (SqliteCommand rebuild = _db.CreateCommand())
    {
        rebuild.CommandText = RebuildSql;
        rebuild.ExecuteNonQuery();
    }
}
/// <summary>
/// Counts the rows of the <c>chunks</c> table.
/// </summary>
/// <returns>Number of chunks currently stored in the index.</returns>
private int GetTotalChunkCount()
{
    using (SqliteCommand counter = _db.CreateCommand())
    {
        counter.CommandText = "SELECT COUNT(*) FROM chunks";
        object? rowCount = counter.ExecuteScalar();
        return Convert.ToInt32(rowCount);
    }
}
/// <summary>
/// Stores a key/value pair in the <c>meta</c> table, replacing any existing value for the key.
/// </summary>
/// <param name="key">Metadata key.</param>
/// <param name="value">Value to store.</param>
private void SaveMeta(string key, string value)
{
    using (SqliteCommand upsert = _db.CreateCommand())
    {
        upsert.CommandText = "INSERT OR REPLACE INTO meta (key, value) VALUES (@k, @v)";
        upsert.Parameters.AddWithValue("@k", key);
        upsert.Parameters.AddWithValue("@v", value);
        upsert.ExecuteNonQuery();
    }
}
/// <summary>
/// Finds the indexed chunks most similar to <paramref name="query"/> using TF-IDF
/// cosine similarity.
/// </summary>
/// <param name="query">Free-text query; a null, empty or whitespace-only query yields no results.</param>
/// <param name="maxResults">Maximum number of results; values of zero or less yield no results.</param>
/// <returns>
/// Up to <paramref name="maxResults"/> results ordered by descending score. Only chunks
/// that share at least one token with the query and score above 0.01 are considered.
/// The list is empty when nothing is indexed or nothing matches.
/// </returns>
public List<SearchResult> Search(string query, int maxResults = 5)
{
    List<SearchResult> results = new List<SearchResult>();

    SqliteConnection? db = _db;
    if (!_indexed || db == null || _totalDocs == 0)
    {
        return results;
    }

    // Bail out before tokenizing or touching the database when the request cannot produce anything.
    if (maxResults <= 0 || string.IsNullOrWhiteSpace(query))
    {
        return results;
    }

    Dictionary<string, int> queryTf = Tokenize(query);
    if (queryTf.Count == 0)
    {
        return results;
    }

    // For each query token: its document frequency, and the chunks that contain it.
    Dictionary<string, int> dfByToken = new Dictionary<string, int>();
    HashSet<int> candidateIds = new HashSet<int>();
    foreach (string token in queryTf.Keys)
    {
        using (SqliteCommand dfCommand = db.CreateCommand())
        {
            dfCommand.CommandText = "SELECT df FROM doc_freq WHERE token = @t";
            dfCommand.Parameters.AddWithValue("@t", token);
            object? df = dfCommand.ExecuteScalar();
            if (df != null)
            {
                dfByToken[token] = Convert.ToInt32(df);
            }
        }

        using (SqliteCommand chunkCommand = db.CreateCommand())
        {
            chunkCommand.CommandText = "SELECT DISTINCT chunk_id FROM tokens WHERE token = @t";
            chunkCommand.Parameters.AddWithValue("@t", token);
            using SqliteDataReader chunkReader = chunkCommand.ExecuteReader();
            while (chunkReader.Read())
            {
                candidateIds.Add(chunkReader.GetInt32(0));
            }
        }
    }

    if (candidateIds.Count == 0)
    {
        return results;
    }

    // Score every candidate chunk against the query.
    List<(int ChunkId, double Score)> scored = new List<(int ChunkId, double Score)>();
    foreach (int chunkId in candidateIds)
    {
        Dictionary<string, int> docTf = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
        using (SqliteCommand tokenCommand = db.CreateCommand())
        {
            tokenCommand.CommandText = "SELECT token, tf FROM tokens WHERE chunk_id = @cid";
            tokenCommand.Parameters.AddWithValue("@cid", chunkId);
            using SqliteDataReader tokenReader = tokenCommand.ExecuteReader();
            while (tokenReader.Read())
            {
                docTf[tokenReader.GetString(0)] = tokenReader.GetInt32(1);
            }
        }

        double score = ComputeTfIdfSimilarity(queryTf, docTf, dfByToken);
        if (score > 0.01)
        {
            scored.Add((chunkId, score));
        }
    }

    // Load file path, line range and a preview for the best-scoring chunks only.
    foreach ((int ChunkId, double Score) hit in scored.OrderByDescending(s => s.Score).Take(maxResults))
    {
        using SqliteCommand detailCommand = db.CreateCommand();
        detailCommand.CommandText = " SELECT f.path, c.start_line, c.end_line, c.content\n FROM chunks c JOIN files f ON c.file_id = f.id\n WHERE c.id = @cid";
        detailCommand.Parameters.AddWithValue("@cid", hit.ChunkId);
        using SqliteDataReader detailReader = detailCommand.ExecuteReader();
        if (!detailReader.Read())
        {
            continue;
        }

        string content = detailReader.GetString(3);
        results.Add(new SearchResult
        {
            FilePath = detailReader.GetString(0),
            StartLine = detailReader.GetInt32(1),
            EndLine = detailReader.GetInt32(2),
            Score = hit.Score,
            // Previews are capped at 200 characters, with "..." marking truncation.
            Preview = content.Length > 200 ? content.Substring(0, 200) + "..." : content
        });
    }

    return results;
}
/// <summary>
/// Opens a previously built index for <paramref name="workFolder"/> if its database file
/// already exists, and refreshes the chunk count and indexed flag from it.
/// Does nothing when the folder is null, empty or missing, or when no database file exists.
/// </summary>
/// <param name="workFolder">Folder whose existing index should be loaded.</param>
public void TryLoadExisting(string workFolder)
{
    bool folderUsable = !string.IsNullOrEmpty(workFolder) && Directory.Exists(workFolder);
    if (!folderUsable)
    {
        return;
    }

    if (!File.Exists(GetDbPath(workFolder)))
    {
        return;
    }

    EnsureDb(workFolder);
    _totalDocs = GetTotalChunkCount();
    _indexed = _totalDocs > 0;

    if (!_indexed)
    {
        return;
    }

    LogService.Info($"기존 코드 인덱스 로드: {_totalDocs}개 청크 [{workFolder}]");
}
/// <summary>
/// Cosine similarity between a query and a chunk, with each token weighted by
/// <c>tf * ln(1 + N / (1 + df))</c>, where N is <c>_totalDocs</c>.
/// </summary>
/// <param name="queryTf">Token frequencies of the query.</param>
/// <param name="docTf">Token frequencies of the chunk.</param>
/// <param name="dfMap">Document frequency per token; tokens missing from the map count as df = 0.</param>
/// <returns>The similarity, or 0 when either vector has zero length.</returns>
private double ComputeTfIdfSimilarity(Dictionary<string, int> queryTf, Dictionary<string, int> docTf, Dictionary<string, int> dfMap)
{
    // Inverse document frequency of a token under the current document count.
    double Idf(string token)
    {
        int df = dfMap.GetValueOrDefault(token, 0);
        return Math.Log(1.0 + (double)_totalDocs / (1.0 + (double)df));
    }

    double dotProduct = 0.0;
    double querySquares = 0.0;
    double docSquares = 0.0;

    foreach (KeyValuePair<string, int> queryEntry in queryTf)
    {
        double idf = Idf(queryEntry.Key);
        double queryWeight = (double)queryEntry.Value * idf;
        querySquares += queryWeight * queryWeight;

        if (docTf.TryGetValue(queryEntry.Key, out int docCount))
        {
            double docWeight = (double)docCount * idf;
            dotProduct += queryWeight * docWeight;
        }
    }

    foreach (KeyValuePair<string, int> docEntry in docTf)
    {
        double docWeight = (double)docEntry.Value * Idf(docEntry.Key);
        docSquares += docWeight * docWeight;
    }

    bool hasZeroVector = querySquares == 0.0 || docSquares == 0.0;
    if (hasZeroVector)
    {
        return 0.0;
    }

    return dotProduct / (Math.Sqrt(querySquares) * Math.Sqrt(docSquares));
}
/// <summary>
/// Turns text into a bag of lower-cased tokens with their counts.
/// The text is split on anything that is not an ASCII letter, digit, Hangul syllable or
/// underscore; each piece is split again at upper-case letters; pieces shorter than two
/// characters and stop words are dropped. Each pair of adjacent surviving tokens is also
/// counted as a bigram joined by an underscore.
/// </summary>
/// <param name="text">Text to tokenize; null or empty yields an empty result.</param>
/// <returns>Map from token or bigram to its number of occurrences (case-insensitive keys).</returns>
private static Dictionary<string, int> Tokenize(string text)
{
    Dictionary<string, int> counts = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
    if (string.IsNullOrEmpty(text))
    {
        return counts;
    }

    // Materialise once: the word list is walked twice (unigrams, then bigrams), and a
    // deferred query would repeat the regex split and camel-case split on each pass.
    List<string> words = Regex.Split(text, "[^a-zA-Z0-9가-힣_]+")
        .SelectMany(SplitCamelCase)
        .Where(w => w.Length >= 2 && !StopWords.Contains(w))
        .Select(w => w.ToLowerInvariant())
        .ToList();

    foreach (string word in words)
    {
        counts.TryGetValue(word, out int seen);
        counts[word] = seen + 1;
    }

    for (int i = 0; i < words.Count - 1; i++)
    {
        string bigram = words[i] + "_" + words[i + 1];
        counts.TryGetValue(bigram, out int seen);
        counts[bigram] = seen + 1;
    }

    return counts;
}
/// <summary>
/// Lazily splits a word before every upper-case letter except one at position 0.
/// Consecutive upper-case letters therefore each start their own segment.
/// </summary>
/// <param name="word">Word to split; null or empty yields nothing.</param>
/// <returns>The segments in order; concatenated they give back <paramref name="word"/>.</returns>
private static IEnumerable<string> SplitCamelCase(string word)
{
    if (string.IsNullOrEmpty(word))
    {
        yield break;
    }

    int segmentStart = 0;
    for (int index = 1; index < word.Length; index++)
    {
        if (!char.IsUpper(word[index]))
        {
            continue;
        }

        yield return word.Substring(segmentStart, index - segmentStart);
        segmentStart = index;
    }

    // The tail after the last split point is never empty for a non-empty word.
    yield return word.Substring(segmentStart);
}
/// <summary>
/// Closes the index database connection, if one is open, and clears the reference.
/// </summary>
public void Dispose()
{
    if (_db != null)
    {
        _db.Dispose();
    }

    _db = null;
}
}