Files
AX-Copilot-Codex/src/AxCopilot/Services/Agent/DocumentReaderTool.cs

572 lines
23 KiB
C#

using System.IO;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Spreadsheet;
using UglyToad.PdfPig;
namespace AxCopilot.Services.Agent;
/// <summary>
/// 문서 파일을 읽어 텍스트로 반환하는 도구.
/// PDF, DOCX, XLSX, CSV, TXT, BibTeX, RIS 등 다양한 형식을 지원합니다.
/// </summary>
public class DocumentReaderTool : IAgentTool
{
/// <summary>The tool's name string: <c>document_read</c>.</summary>
public string Name => "document_read";
/// <summary>
/// Human/LLM-readable description of the tool: the supported file formats and how the optional
/// 'offset', 'pages' and 'section' parameters are meant to be used.
/// </summary>
public string Description =>
"Read a document file and extract its text content. " +
"Supports: PDF (.pdf), Word (.docx), Excel (.xlsx), CSV (.csv), text (.txt/.log/.json/.xml/.md), " +
"BibTeX (.bib), RIS (.ris). " +
"For large files, use 'offset' to read from a specific character position (chunked reading). " +
"For large PDFs, use 'pages' parameter to read specific page ranges (e.g., '1-5', '10-20'). " +
"Use 'section' parameter with value 'references' to extract only the references/bibliography section from a PDF.";
/// <summary>
/// Parameter schema for this tool. Only <c>path</c> is required; <c>max_chars</c>, <c>offset</c>,
/// <c>sheet</c>, <c>pages</c> and <c>section</c> are optional.
/// </summary>
/// <remarks>The property is expression-bodied with <c>new()</c>, so a new schema object is built on every access.</remarks>
public ToolParameterSchema Parameters => new()
{
Properties = new()
{
["path"] = new() { Type = "string", Description = "Document file path (absolute or relative to work folder)" },
["max_chars"] = new() { Type = "integer", Description = "Maximum characters to extract per chunk. Default: 8000. Use smaller values for summaries." },
["offset"] = new() { Type = "integer", Description = "Character offset to start reading from. Default: 0. Use this to read the next chunk of a large file (value from 'next_offset' in previous response)." },
["sheet"] = new() { Type = "string", Description = "For Excel files: sheet name or 1-based index. Default: first sheet." },
["pages"] = new() { Type = "string", Description = "For PDF files: page range to read (e.g., '1-5', '3', '10-20'). Default: all pages." },
["section"] = new() { Type = "string", Description = "Extract specific section. 'references' = extract references/bibliography from PDF. 'abstract' = extract abstract." },
},
Required = ["path"]
};
/// <summary>
/// File extensions that ExecuteAsync hands to ReadTextFile (read as plain text).
/// The set compares case-insensitively (<see cref="StringComparer.OrdinalIgnoreCase"/>).
/// </summary>
private static readonly HashSet<string> TextExtensions = new(StringComparer.OrdinalIgnoreCase)
{
".txt", ".log", ".json", ".xml", ".md", ".csv", ".tsv",
".yaml", ".yml", ".ini", ".cfg", ".conf", ".properties",
".html", ".htm", ".css", ".js", ".ts", ".py", ".cs", ".java",
".sql", ".sh", ".bat", ".ps1", ".r", ".m",
};
/// <summary>
/// Chunk size (in characters) used by ExecuteAsync when <c>max_chars</c> is absent,
/// unparsable, or smaller than 100.
/// </summary>
private const int DefaultMaxChars = 8000;
/// <summary>
/// Reads the document named by the <c>path</c> argument and returns one chunk of its text,
/// prefixed with a short header (file name, type, size and paging hints).
/// </summary>
/// <remarks>
/// The reader is selected from the file extension. Chunking is character based: <c>offset</c>
/// is where the chunk starts inside the extracted text and <c>max_chars</c> bounds its length
/// (values below 100 fall back to <see cref="DefaultMaxChars"/>, negative offsets become 0).
/// </remarks>
/// <param name="args">Tool arguments; <c>path</c> is required, the rest are optional.</param>
/// <param name="context">Supplies the work folder for relative paths and the path access check.</param>
/// <param name="ct">Cancellation token handed to the extraction work.</param>
/// <returns>
/// A successful result with header and text, or a failure result when the path is missing,
/// blocked, not found, of an unsupported type, or when extraction throws.
/// </returns>
public async Task<ToolResult> ExecuteAsync(JsonElement args, AgentContext context, CancellationToken ct)
{
    // String arguments are read leniently: callers sometimes send a JSON number where the
    // schema says string (e.g. "sheet": 2), and JsonElement.GetString() throws on that.
    var path = ReadStringArg(args, "path");
    if (string.IsNullOrWhiteSpace(path))
        return ToolResult.Fail("path가 필요합니다.");

    var maxChars = args.TryGetProperty("max_chars", out var mc) ? GetIntValue(mc, DefaultMaxChars) : DefaultMaxChars;
    var offset = args.TryGetProperty("offset", out var off) ? GetIntValue(off, 0) : 0;
    var sheetParam = ReadStringArg(args, "sheet");
    var pagesParam = ReadStringArg(args, "pages");
    var sectionParam = ReadStringArg(args, "section");

    if (maxChars < 100) maxChars = DefaultMaxChars;
    if (offset < 0) offset = 0;

    var fullPath = FileReadTool.ResolvePath(path, context.WorkFolder);
    if (!context.IsPathAllowed(fullPath))
        return ToolResult.Fail($"경로 접근 차단: {fullPath}");
    if (!File.Exists(fullPath))
        return ToolResult.Fail($"파일이 존재하지 않습니다: {fullPath}");

    var ext = Path.GetExtension(fullPath).ToLowerInvariant();
    try
    {
        // With an offset the readers must extract everything up to the end of the requested
        // chunk (plus a little slack so "more content" can be detected). The sum is computed
        // in 64-bit and clamped so huge offsets cannot wrap around to a negative limit.
        var extractMax = offset > 0
            ? (int)Math.Min((long)offset + maxChars + 100, int.MaxValue)
            : maxChars;

        var text = ext switch
        {
            ".pdf" => await Task.Run(() => ReadPdf(fullPath, extractMax, pagesParam, sectionParam), ct),
            ".docx" => await Task.Run(() => ReadDocx(fullPath, extractMax), ct),
            ".xlsx" => await Task.Run(() => ReadXlsx(fullPath, sheetParam, extractMax), ct),
            ".bib" => await Task.Run(() => ReadBibTeX(fullPath, extractMax), ct),
            ".ris" => await Task.Run(() => ReadRis(fullPath, extractMax), ct),
            _ when TextExtensions.Contains(ext) => await ReadTextFile(fullPath, extractMax, ct),
            _ => null,
        };
        if (text == null)
            return ToolResult.Fail($"지원하지 않는 파일 형식: {ext}");

        var totalExtracted = text.Length;

        // Apply the offset (chunked reading).
        if (offset > 0)
        {
            if (offset >= text.Length)
                return ToolResult.Ok($"[{Path.GetFileName(fullPath)}] offset {offset}은 문서 끝을 초과합니다 (전체 {text.Length}자).", fullPath);
            text = text[offset..];
        }

        // Cut the chunk down to maxChars; anything beyond that means another chunk exists.
        var hasMore = text.Length > maxChars;
        if (hasMore)
            text = text[..maxChars];

        var fileInfo = new FileInfo(fullPath);
        var header = $"[{Path.GetFileName(fullPath)}] ({ext.TrimStart('.')}, {FormatSize(fileInfo.Length)})";
        if (offset > 0)
            header += $" — offset {offset}부터 {text.Length}자 읽음"; // report what was actually read, not the limit
        if (hasMore)
        {
            var nextOffset = (long)offset + maxChars; // 64-bit so the hint itself cannot overflow
            header += $"\n⚡ 추가 내용이 있습니다. 다음 청크를 읽으려면 offset={nextOffset}을 사용하세요.";
        }
        else if (offset > 0)
        {
            header += " — 문서 끝까지 읽음 ✓";
        }
        else if (totalExtracted >= maxChars)
        {
            header += $" — 처음 {maxChars}자만 추출됨. 계속 읽으려면 offset={maxChars}을 사용하거나, pages 파라미터로 특정 페이지를 지정하세요.";
        }
        return ToolResult.Ok($"{header}\n\n{text}", fullPath);
    }
    catch (Exception ex)
    {
        return ToolResult.Fail($"문서 읽기 실패: {ex.Message}");
    }
}

/// <summary>
/// Reads an optional string argument without throwing: JSON strings are returned as-is,
/// JSON numbers as their raw text, and anything else (or a missing property) as "".
/// </summary>
private static string ReadStringArg(JsonElement args, string name)
{
    if (args.ValueKind != JsonValueKind.Object || !args.TryGetProperty(name, out var el))
        return "";
    return el.ValueKind switch
    {
        JsonValueKind.String => el.GetString() ?? "",
        JsonValueKind.Number => el.GetRawText(),
        _ => "",
    };
}
// ─── PDF ────────────────────────────────────────────────────────────────
/// <summary>
/// Extracts text from a PDF. When <paramref name="sectionParam"/> is "references" or "abstract"
/// (case-insensitive) the matching section extractor is used; otherwise the pages selected by
/// <paramref name="pagesParam"/> are dumped one after another.
/// </summary>
/// <param name="path">Path of the PDF file to open.</param>
/// <param name="maxChars">Upper bound for the returned text (before the truncation notice).</param>
/// <param name="pagesParam">Page selection such as "3" or "1-5"; empty means all pages.</param>
/// <param name="sectionParam">Optional section selector.</param>
private static string ReadPdf(string path, int maxChars, string pagesParam, string sectionParam)
{
    using var pdf = PdfDocument.Open(path);
    var pageCount = pdf.NumberOfPages;

    // Section modes bypass the page-range dump entirely.
    if (string.Equals(sectionParam, "references", StringComparison.OrdinalIgnoreCase))
        return ExtractReferences(pdf, pageCount, maxChars);
    if (string.Equals(sectionParam, "abstract", StringComparison.OrdinalIgnoreCase))
        return ExtractAbstract(pdf, pageCount, maxChars);

    var (firstPage, lastPage) = ParsePageRange(pagesParam, pageCount);

    var output = new StringBuilder();
    output.AppendLine($"PDF: {pageCount}페이지");
    output.AppendLine();
    output.AppendLine($"읽는 범위: {firstPage}-{lastPage} / {pageCount} 페이지");
    output.AppendLine();

    // Stop as soon as enough text has been collected; Truncate trims the overshoot.
    var pageNumber = firstPage;
    while (pageNumber <= lastPage && output.Length < maxChars)
    {
        var content = pdf.GetPage(pageNumber).Text;
        if (!string.IsNullOrWhiteSpace(content))
        {
            output.AppendLine($"--- Page {pageNumber} ---");
            output.AppendLine(content.Trim());
            output.AppendLine();
        }
        pageNumber++;
    }

    return Truncate(output.ToString(), maxChars);
}
/// <summary>
/// Parses a page selection ("5" or "3-10") into an inclusive 1-based page range.
/// </summary>
/// <param name="pagesParam">The selection text; empty or unparsable input selects every page.</param>
/// <param name="totalPages">Number of pages in the document.</param>
/// <returns>
/// A range with both ends clamped to 1..<paramref name="totalPages"/> and start &lt;= end
/// (reversed input such as "7-3" is reordered). For a document without pages the
/// result is (1, <paramref name="totalPages"/>), i.e. an empty range.
/// </returns>
private static (int start, int end) ParsePageRange(string pagesParam, int totalPages)
{
    if (totalPages < 1 || string.IsNullOrWhiteSpace(pagesParam))
        return (1, totalPages);

    var spec = pagesParam.Trim();
    int first, last;
    if (int.TryParse(spec, out var single))
    {
        // "5" -> page 5 only
        first = last = single;
    }
    else
    {
        // "3-10" -> pages 3 to 10
        var parts = spec.Split('-', StringSplitOptions.TrimEntries);
        if (parts.Length != 2 || !int.TryParse(parts[0], out first) || !int.TryParse(parts[1], out last))
            return (1, totalPages);
    }

    // Reorder reversed ranges and keep both ends inside the document so the caller
    // never receives an inverted (silently empty) range for a well-formed request.
    if (first > last)
        (first, last) = (last, first);
    return (Math.Clamp(first, 1, totalPages), Math.Clamp(last, 1, totalPages));
}
/// <summary>
/// Extracts the References/Bibliography section of a PDF.
/// </summary>
/// <remarks>
/// The last pages (at most eleven, scanning backwards) are searched for a heading line such as
/// "References" or "Bibliography". When a heading is found, the text from the heading to the end
/// of the document is collected; otherwise the last three pages are returned. If the collected
/// text can be split into numbered entries, the entries are listed one per line instead.
/// </remarks>
private static string ExtractReferences(PdfDocument doc, int totalPages, int maxChars)
{
    string[] headingPatterns =
    [
        @"(?i)^\s*(References|Bibliography|Works\s+Cited|Literature\s+Cited|참고\s*문헌|참조|인용\s*문헌)\s*$",
        @"(?i)^(References|Bibliography|참고문헌)\s*\n",
    ];

    var collected = new StringBuilder();
    collected.AppendLine("=== References / Bibliography ===");
    collected.AppendLine();

    // Walk backwards from the last page looking for the section heading.
    var headingLocated = false;
    var earliestPage = Math.Max(1, totalPages - 10);
    for (var pageNo = totalPages; pageNo >= earliestPage && !headingLocated; pageNo--)
    {
        var content = doc.GetPage(pageNo).Text;
        if (string.IsNullOrWhiteSpace(content))
            continue;

        // Patterns are tried in order; the first one that matches wins.
        var heading = headingPatterns
            .Select(pattern => Regex.Match(content, pattern, RegexOptions.Multiline))
            .FirstOrDefault(candidate => candidate.Success);
        if (heading == null)
            continue;

        // Take everything from the heading to the end of this page ...
        collected.AppendLine($"(Page {pageNo}부터 시작)");
        collected.AppendLine(content[heading.Index..].Trim());

        // ... and then the pages that follow it.
        var following = pageNo + 1;
        while (following <= totalPages && collected.Length < maxChars)
        {
            var more = doc.GetPage(following).Text;
            if (!string.IsNullOrWhiteSpace(more))
                collected.AppendLine(more.Trim());
            following++;
        }
        headingLocated = true;
    }

    if (!headingLocated)
    {
        // No heading found: fall back to the last three pages.
        collected.AppendLine("(References 섹션 헤더를 찾지 못했습니다. 마지막 3페이지를 반환합니다.)");
        collected.AppendLine();
        var pageNo = Math.Max(1, totalPages - 2);
        while (pageNo <= totalPages && collected.Length < maxChars)
        {
            var content = doc.GetPage(pageNo).Text;
            if (!string.IsNullOrWhiteSpace(content))
            {
                collected.AppendLine($"--- Page {pageNo} ---");
                collected.AppendLine(content.Trim());
                collected.AppendLine();
            }
            pageNo++;
        }
    }

    // Prefer a clean numbered list when individual entries can be recognised.
    var rawText = collected.ToString();
    var items = ParseReferenceEntries(rawText);
    if (items.Count == 0)
        return Truncate(rawText, maxChars);

    var listing = new StringBuilder();
    listing.AppendLine($"=== References ({items.Count}개 항목) ===\n");
    var number = 0;
    foreach (var item in items)
    {
        number++;
        listing.AppendLine($"[{number}] {item}");
    }
    return Truncate(listing.ToString(), maxChars);
}
/// <summary>
/// Extracts the Abstract section of a PDF.
/// </summary>
/// <remarks>
/// The first three pages are searched for an "Abstract" heading followed by text that ends
/// before a "Keywords"/"Introduction"-style heading. When nothing matches, the text of the
/// first page is returned together with a notice.
/// </remarks>
private static string ExtractAbstract(PdfDocument doc, int totalPages, int maxChars)
{
    const string abstractPattern =
        @"(?i)(Abstract|초록|요약)\s*\n(.*?)(?=\n\s*(Keywords|Introduction|1\.|서론|키워드|핵심어)\s*[\n:])";

    var output = new StringBuilder();
    output.AppendLine("=== Abstract ===");
    output.AppendLine();

    // Look for the abstract on the first three pages.
    var lastCandidate = Math.Min(3, totalPages);
    var pageNo = 1;
    while (pageNo <= lastCandidate)
    {
        var content = doc.GetPage(pageNo).Text;
        pageNo++;
        if (string.IsNullOrWhiteSpace(content))
            continue;

        var hit = Regex.Match(content, abstractPattern, RegexOptions.Singleline);
        if (!hit.Success)
            continue;

        // Group 2 is the body between the heading and the next section heading.
        output.AppendLine(hit.Groups[2].Value.Trim());
        return Truncate(output.ToString(), maxChars);
    }

    // Not found: return the first page instead.
    output.AppendLine("(Abstract 섹션을 찾지 못했습니다. 첫 페이지를 반환합니다.)");
    var openingPage = doc.GetPage(1).Text;
    if (!string.IsNullOrWhiteSpace(openingPage))
        output.AppendLine(openingPage.Trim());
    return Truncate(output.ToString(), maxChars);
}
/// <summary>
/// Splits reference-list text into individual entries.
/// </summary>
/// <remarks>
/// Two numbering styles are recognised at the start of a line: "[1] ..." (needs at least two
/// markers) and "1. ..." (needs at least three markers, because a leading "number." is a much
/// weaker signal). Each entry is flattened to a single line with runs of whitespace collapsed
/// to one space; entries of ten characters or fewer are discarded.
/// </remarks>
/// <param name="text">Raw reference text; anything before the first marker is ignored.</param>
/// <returns>The entries in document order, or an empty list when no numbering was recognised.</returns>
private static List<string> ParseReferenceEntries(string text)
{
    var entries = new List<string>();
    if (string.IsNullOrWhiteSpace(text))
        return entries;

    // "(?:^|\n)" also accepts a marker at the very start of the text, not only after a newline.
    // Regex.Split keeps the captured numbers, so the result alternates:
    // [preamble, "1", entry, "2", entry, ...] and the entries sit at indexes 2, 4, 6, ...
    var bracketed = Regex.Split(text, @"(?:^|\n)\s*\[(\d+)\]\s*");
    if (bracketed.Length > 3)
    {
        Collect(bracketed);
        return entries;
    }

    var dotted = Regex.Split(text, @"(?:^|\n)\s*(\d+)\.\s+");
    if (dotted.Length > 5)
        Collect(dotted);
    return entries;

    void Collect(string[] parts)
    {
        for (var i = 2; i < parts.Length; i += 2)
        {
            // Collapse line breaks (including "\r\n") and repeated spaces into single spaces.
            var entry = Regex.Replace(parts[i], @"\s+", " ").Trim();
            if (entry.Length > 10)
                entries.Add(entry);
        }
    }
}
// ─── BibTeX ─────────────────────────────────────────────────────────────
/// <summary>Matches a BibTeX field name followed by '=' (the value is parsed by hand afterwards).</summary>
private static readonly Regex BibTeXFieldNamePattern = new(@"(\w+)\s*=\s*");

/// <summary>
/// Reads a BibTeX file and lists its entries with their key bibliographic fields
/// (author, title, journal, booktitle, year, volume, number, pages, doi, publisher, url).
/// </summary>
/// <param name="path">Path of the .bib file.</param>
/// <param name="maxChars">Upper bound for the returned text (before the truncation notice).</param>
/// <returns>The formatted entry list, or the raw file content when no entry could be parsed.</returns>
private static string ReadBibTeX(string path, int maxChars)
{
    var content = TextFileCodec.ReadAllText(path).Text;
    var sb = new StringBuilder();

    // An entry is "@type{key, fields... }" whose closing brace sits at the start of a line.
    var entryPattern = new Regex(
        @"@(\w+)\s*\{\s*([^,\s]+)\s*,\s*(.*?)\n\s*\}",
        RegexOptions.Singleline);
    var matches = entryPattern.Matches(content);

    sb.AppendLine($"BibTeX: {matches.Count}개 항목");
    sb.AppendLine();

    int idx = 0;
    foreach (Match m in matches)
    {
        if (sb.Length >= maxChars) break;
        idx++;
        var entryType = m.Groups[1].Value;
        var citeKey = m.Groups[2].Value;
        var body = m.Groups[3].Value;
        sb.AppendLine($"[{idx}] @{entryType}{{{citeKey}}}");

        foreach (var (fieldName, fieldValue) in ParseBibTeXFields(body))
        {
            // Only the key bibliographic fields are shown.
            if (fieldName is "author" or "title" or "journal" or "booktitle"
                or "year" or "volume" or "number" or "pages" or "doi"
                or "publisher" or "url")
            {
                sb.AppendLine($" {fieldName}: {fieldValue}");
            }
        }
        sb.AppendLine();
    }

    if (matches.Count == 0)
    {
        sb.AppendLine("(BibTeX 항목을 파싱하지 못했습니다. 원문을 반환합니다.)");
        // Never hand Truncate a negative budget.
        sb.AppendLine(Truncate(content, Math.Max(0, maxChars - sb.Length)));
    }
    return Truncate(sb.ToString(), maxChars);
}

/// <summary>
/// Splits the body of one BibTeX entry into (name, value) pairs. Field names are lower-cased
/// with the invariant culture. Values may be brace-delimited (nested braces are balanced),
/// quote-delimited (quotes inside braces do not end the value) or bare (e.g. <c>year = 2020</c>).
/// </summary>
private static List<(string Name, string Value)> ParseBibTeXFields(string body)
{
    var fields = new List<(string Name, string Value)>();
    var pos = 0;
    while (pos < body.Length)
    {
        var nameMatch = BibTeXFieldNamePattern.Match(body, pos);
        if (!nameMatch.Success)
            break;

        var name = nameMatch.Groups[1].Value.ToLowerInvariant();
        var valueStart = nameMatch.Index + nameMatch.Length;
        if (valueStart >= body.Length)
            break; // "name =" with nothing after it

        string value;
        int next;
        var opener = body[valueStart];
        if (opener == '{')
        {
            // Scan to the brace that closes this value, counting nested pairs.
            var depth = 1;
            var i = valueStart + 1;
            while (i < body.Length && depth > 0)
            {
                if (body[i] == '{') depth++;
                else if (body[i] == '}') depth--;
                i++;
            }
            // When balanced, i is one past the closing brace; otherwise take the rest.
            var valueEnd = depth == 0 ? i - 1 : i;
            value = body[(valueStart + 1)..valueEnd];
            next = i;
        }
        else if (opener == '"')
        {
            // The closing quote is the first '"' that is not inside braces (e.g. {\"o}).
            var depth = 0;
            var i = valueStart + 1;
            while (i < body.Length && !(body[i] == '"' && depth == 0))
            {
                if (body[i] == '{') depth++;
                else if (body[i] == '}' && depth > 0) depth--;
                i++;
            }
            value = body[(valueStart + 1)..i];
            next = Math.Min(i + 1, body.Length);
        }
        else
        {
            // Bare value (number or macro): runs to the next comma or line end.
            var i = valueStart;
            while (i < body.Length && body[i] != ',' && body[i] != '\n' && body[i] != '\r')
                i++;
            value = body[valueStart..i];
            next = i;
        }

        fields.Add((name, value.Trim()));
        pos = next; // continue after the value so '=' inside a value is never taken for a field
    }
    return fields;
}
// ─── RIS ────────────────────────────────────────────────────────────────
/// <summary>
/// Matches one RIS line: a two-character tag, one or two spaces, a hyphen and the optional value
/// (standard layout is <c>"TY  - JOUR"</c>; a trimmed <c>"ER  -"</c> is accepted too).
/// </summary>
private static readonly Regex RisLinePattern = new(@"^([A-Z][A-Z0-9])\s{1,2}-\s?(.*)$");

/// <summary>
/// Reads a RIS file and lists every record with human-readable field labels.
/// </summary>
/// <remarks>
/// A record starts at a <c>TY</c> line and ends at an <c>ER</c> line. Repeated tags are kept in
/// order; authors and keywords are joined with "; ", other repeated values with a space.
/// Lines that do not look like a tag line are ignored.
/// </remarks>
/// <param name="path">Path of the .ris file.</param>
/// <param name="maxChars">Upper bound for the returned text (before the truncation notice).</param>
private static string ReadRis(string path, int maxChars)
{
    var lines = TextFileCodec.SplitLines(TextFileCodec.ReadAllText(path).Text);
    var sb = new StringBuilder();
    var entries = new List<Dictionary<string, List<string>>>();
    Dictionary<string, List<string>>? current = null;

    foreach (var line in lines)
    {
        var parsed = RisLinePattern.Match(line);
        if (!parsed.Success)
            continue;

        var tag = parsed.Groups[1].Value;
        var value = parsed.Groups[2].Value.Trim();

        if (tag == "TY")
        {
            // Start of a new record; the TY value itself is stored below like any other tag.
            current = new Dictionary<string, List<string>>();
            entries.Add(current);
        }
        else if (tag == "ER")
        {
            current = null;
            continue;
        }

        if (current == null)
            continue; // tag line outside of any record

        if (!current.TryGetValue(tag, out var values))
        {
            values = new List<string>();
            current[tag] = values;
        }
        values.Add(value);
    }

    sb.AppendLine($"RIS: {entries.Count}개 항목");
    sb.AppendLine();

    // RIS tag -> human-readable label
    var tagNames = new Dictionary<string, string>
    {
        ["TY"] = "Type", ["AU"] = "Author", ["TI"] = "Title", ["T1"] = "Title",
        ["JO"] = "Journal", ["JF"] = "Journal", ["PY"] = "Year", ["Y1"] = "Year",
        ["VL"] = "Volume", ["IS"] = "Issue", ["SP"] = "Start Page", ["EP"] = "End Page",
        ["DO"] = "DOI", ["UR"] = "URL", ["PB"] = "Publisher", ["AB"] = "Abstract",
        ["KW"] = "Keyword", ["SN"] = "ISSN/ISBN",
    };

    for (int i = 0; i < entries.Count && sb.Length < maxChars; i++)
    {
        sb.AppendLine($"[{i + 1}]");
        foreach (var (tag, values) in entries[i])
        {
            var label = tagNames.GetValueOrDefault(tag, tag);
            if (tag is "AU" or "KW")
                sb.AppendLine($" {label}: {string.Join("; ", values)}");
            else
                sb.AppendLine($" {label}: {string.Join(" ", values)}");
        }
        sb.AppendLine();
    }
    return Truncate(sb.ToString(), maxChars);
}
// ─── DOCX ───────────────────────────────────────────────────────────────
/// <summary>
/// Extracts the text of a Word document in document order.
/// </summary>
/// <remarks>
/// Paragraphs are written one per line. Tables are written one row per line with cells separated
/// by tabs. Other containers found in the body (for example content controls) are searched
/// recursively, so paragraphs and tables nested inside them are not lost.
/// </remarks>
/// <param name="path">Path of the .docx file.</param>
/// <param name="maxChars">Upper bound for the returned text (before the truncation notice).</param>
private static string ReadDocx(string path, int maxChars)
{
    var sb = new StringBuilder();
    using var doc = WordprocessingDocument.Open(path, false);
    var body = doc.MainDocumentPart?.Document?.Body;
    if (body == null) return "(빈 문서)";

    AppendBlocks(body);
    return Truncate(sb.ToString(), maxChars);

    // Walks the direct children of a container. Elements<Paragraph>() on the body alone would
    // skip everything that is not a top-level paragraph, most notably table contents.
    void AppendBlocks(DocumentFormat.OpenXml.OpenXmlElement container)
    {
        foreach (var element in container.Elements())
        {
            if (sb.Length >= maxChars) return;

            switch (element)
            {
                case DocumentFormat.OpenXml.Wordprocessing.Paragraph para:
                {
                    var text = para.InnerText;
                    if (!string.IsNullOrWhiteSpace(text))
                        sb.AppendLine(text);
                    break;
                }
                case DocumentFormat.OpenXml.Wordprocessing.Table table:
                {
                    foreach (var row in table.Elements<DocumentFormat.OpenXml.Wordprocessing.TableRow>())
                    {
                        // A cell can hold several paragraphs; join them with a space.
                        var cells = row
                            .Elements<DocumentFormat.OpenXml.Wordprocessing.TableCell>()
                            .Select(cell => string.Join(" ", cell
                                .Descendants<DocumentFormat.OpenXml.Wordprocessing.Paragraph>()
                                .Select(p => p.InnerText)
                                .Where(t => !string.IsNullOrWhiteSpace(t))));
                        var line = string.Join("\t", cells);
                        if (!string.IsNullOrWhiteSpace(line))
                            sb.AppendLine(line);
                        if (sb.Length >= maxChars) return;
                    }
                    break;
                }
                default:
                    // Content controls and similar wrappers: look inside for paragraphs/tables.
                    if (element.HasChildren)
                        AppendBlocks(element);
                    break;
            }
        }
    }
}
// ─── XLSX ───────────────────────────────────────────────────────────────
/// <summary>
/// Extracts one worksheet of an Excel workbook as tab-separated text, one line per row.
/// </summary>
/// <remarks>
/// The sheet is chosen by 1-based index or by name (case-insensitive); when the requested sheet
/// does not exist the first sheet is read and a notice is added. Cells are placed by their cell
/// reference (e.g. "C5"), so empty cells that are missing from the file still occupy a column.
/// </remarks>
/// <param name="path">Path of the .xlsx file.</param>
/// <param name="sheetParam">Sheet name or 1-based index; empty selects the first sheet.</param>
/// <param name="maxChars">Upper bound for the returned text (before the truncation notice).</param>
private static string ReadXlsx(string path, string sheetParam, int maxChars)
{
    var sb = new StringBuilder();
    using var doc = SpreadsheetDocument.Open(path, false);
    var workbook = doc.WorkbookPart;
    if (workbook == null) return "(빈 스프레드시트)";

    var sheets = workbook.Workbook.Sheets?.Elements<Sheet>().ToList() ?? [];
    if (sheets.Count == 0) return "(시트 없음)";

    sb.AppendLine($"Excel: {sheets.Count}개 시트 ({string.Join(", ", sheets.Select(s => s.Name?.Value))})");
    sb.AppendLine();

    Sheet? targetSheet = null;
    if (!string.IsNullOrEmpty(sheetParam))
    {
        if (int.TryParse(sheetParam, out var idx) && idx >= 1 && idx <= sheets.Count)
            targetSheet = sheets[idx - 1];
        else
            targetSheet = sheets.FirstOrDefault(s =>
                string.Equals(s.Name?.Value, sheetParam, StringComparison.OrdinalIgnoreCase));

        // Make the fallback visible instead of silently reading a different sheet.
        if (targetSheet == null)
            sb.AppendLine($"(요청한 시트 '{sheetParam}'를 찾지 못해 첫 번째 시트를 읽습니다.)");
    }
    targetSheet ??= sheets[0];

    var sheetId = targetSheet.Id?.Value;
    if (sheetId == null) return "(시트 ID 없음)";

    // Chart sheets and similar parts are not worksheets; a hard cast would throw here.
    if (workbook.GetPartById(sheetId) is not WorksheetPart wsPart)
        return "(워크시트가 아닌 시트입니다)";

    var sharedStrings = workbook.SharedStringTablePart?.SharedStringTable
        .Elements<SharedStringItem>().ToList() ?? [];
    var rows = wsPart.Worksheet.Descendants<Row>().ToList();
    sb.AppendLine($"[{targetSheet.Name?.Value}] ({rows.Count} rows)");

    foreach (var row in rows)
    {
        var values = new List<string>();
        foreach (var cell in row.Elements<Cell>())
        {
            // Empty cells are not stored in the file. Pad up to this cell's column so that
            // values stay under the right header. Cells without a usable reference are
            // simply appended.
            var column = GetColumnIndex(cell.CellReference?.Value);
            while (column > values.Count)
                values.Add("");
            values.Add(GetCellValue(cell, sharedStrings));
        }
        sb.AppendLine(string.Join("\t", values));
        if (sb.Length >= maxChars) break;
    }
    return Truncate(sb.ToString(), maxChars);
}

/// <summary>
/// Converts the column letters of a cell reference ("A1" -> 0, "AB12" -> 27) to a 0-based
/// column index. Returns -1 when the reference is missing, has no letters, or lies beyond
/// Excel's 16384-column limit.
/// </summary>
private static int GetColumnIndex(string? cellReference)
{
    if (string.IsNullOrEmpty(cellReference)) return -1;

    var index = 0;
    foreach (var ch in cellReference)
    {
        if (ch is >= 'A' and <= 'Z') index = index * 26 + (ch - 'A' + 1);
        else if (ch is >= 'a' and <= 'z') index = index * 26 + (ch - 'a' + 1);
        else break; // the row digits start here

        if (index > 16384) return -1; // malformed reference; avoid absurd padding
    }
    return index - 1;
}
/// <summary>
/// Returns the display text of a worksheet cell.
/// </summary>
/// <remarks>
/// Shared-string cells are resolved through <paramref name="sharedStrings"/>, inline-string
/// cells use their inline text (they have no cell value element), boolean cells are shown as
/// TRUE/FALSE, and every other cell returns its stored value text unchanged.
/// </remarks>
/// <param name="cell">The cell to read.</param>
/// <param name="sharedStrings">The workbook's shared string items, indexed by position.</param>
private static string GetCellValue(Cell cell, List<SharedStringItem> sharedStrings)
{
    var value = cell.CellValue?.Text ?? "";
    var dataType = cell.DataType?.Value;

    if (dataType == CellValues.SharedString)
    {
        // The stored value is an index into the shared string table.
        if (int.TryParse(value, out var idx) && idx >= 0 && idx < sharedStrings.Count)
            return sharedStrings[idx].InnerText;
        return value;
    }

    if (dataType == CellValues.InlineString)
        return cell.InlineString?.InnerText ?? value;

    if (dataType == CellValues.Boolean)
        return value == "1" ? "TRUE" : value == "0" ? "FALSE" : value;

    return value;
}
// ─── Text ───────────────────────────────────────────────────────────────
/// <summary>
/// Reads a plain-text file through <c>TextFileCodec</c> and limits the result to
/// <paramref name="maxChars"/> characters (a truncation notice is appended when text was cut).
/// </summary>
private static async Task<string> ReadTextFile(string path, int maxChars, CancellationToken ct)
{
    var decoded = await TextFileCodec.ReadAllTextAsync(path, ct);
    return Truncate(decoded.Text, maxChars);
}
// ─── Helpers ────────────────────────────────────────────────────────────
/// <summary>
/// Returns <paramref name="text"/> unchanged when it has at most <paramref name="maxChars"/>
/// characters; otherwise returns its first <paramref name="maxChars"/> characters followed by
/// a notice that the content was cut.
/// </summary>
private static string Truncate(string text, int maxChars)
{
    const string notice = "\n\n... (내용 잘림 — pages 또는 section 파라미터로 특정 부분을 읽을 수 있습니다)";

    return text.Length > maxChars
        ? text.Substring(0, maxChars) + notice
        : text;
}
/// <summary>
/// Formats a byte count for display: plain bytes below 1 KiB, otherwise KB or MB with one decimal.
/// </summary>
private static string FormatSize(long bytes)
{
    const double unit = 1024.0;

    if (bytes < 1024)
        return $"{bytes} B";
    if (bytes < 1024 * 1024)
        return $"{bytes / unit:F1} KB";
    return $"{bytes / (unit * unit):F1} MB";
}
/// <summary>
/// Reads an integer from a JSON value without throwing. Accepts JSON numbers and numeric strings.
/// </summary>
/// <remarks>
/// Numbers that are not plain 32-bit integers (for example <c>8000.0</c>) are truncated toward
/// zero when they fit into an <see cref="int"/>; anything else yields
/// <paramref name="defaultValue"/>. Strings are parsed with the invariant culture.
/// </remarks>
/// <param name="el">The JSON value to read.</param>
/// <param name="defaultValue">Returned when the value is missing an integer interpretation.</param>
private static int GetIntValue(JsonElement el, int defaultValue)
{
    switch (el.ValueKind)
    {
        case JsonValueKind.Number:
            if (el.TryGetInt32(out var number))
                return number;
            // GetInt32 would throw FormatException here (fraction, exponent or out of range).
            if (el.TryGetDouble(out var real) && real >= int.MinValue && real <= int.MaxValue)
                return (int)real;
            return defaultValue;

        case JsonValueKind.String:
            return int.TryParse(
                el.GetString(),
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out var parsed)
                ? parsed
                : defaultValue;

        default:
            return defaultValue;
    }
}
}