132 lines
3.9 KiB
C#
132 lines
3.9 KiB
C#
using System.IO;
|
|
using System.Text;
|
|
|
|
namespace AxCopilot.Services.Agent;
|
|
|
|
/// <summary>
/// Utilities for detecting the encoding of text files and for reading/writing them.
/// <list type="bullet">
/// <item><description>Read: the encoding is detected from a BOM first, then by strict UTF-8 validation, with EUC-KR as the fallback.</description></item>
/// <item><description>Write: <see cref="ResolveWriteEncoding"/> keeps the source encoding and its UTF-8 BOM setting so they can be preserved.</description></item>
/// </list>
/// </summary>
public static class TextFileCodec
{
    /// <summary>Result of decoding the bytes of a text file.</summary>
    /// <param name="Text">Decoded text; a detected BOM is not part of it.</param>
    /// <param name="Encoding">Encoding that was detected and used to decode the bytes.</param>
    /// <param name="HasBom">True when the bytes started with a UTF-8 or UTF-16 byte order mark.</param>
    public readonly record struct TextReadResult(string Text, Encoding Encoding, bool HasBom);

    // Single BOM-less UTF-8 instance shared by detection and by Utf8NoBom.
    private static readonly UTF8Encoding s_utf8NoBom = new(encoderShouldEmitUTF8Identifier: false);

    static TextFileCodec()
    {
        // Makes code-page encodings such as "euc-kr" resolvable through Encoding.GetEncoding.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    }

    /// <summary>Reads the whole file at <paramref name="path"/> and decodes it with the detected encoding.</summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>The decoded text together with the detected encoding and BOM flag.</returns>
    public static TextReadResult ReadAllText(string path)
    {
        var bytes = File.ReadAllBytes(path);
        return Decode(bytes);
    }

    /// <summary>Asynchronously reads the whole file at <paramref name="path"/> and decodes it with the detected encoding.</summary>
    /// <param name="path">Path of the file to read.</param>
    /// <param name="ct">Token passed to the underlying file read.</param>
    /// <returns>The decoded text together with the detected encoding and BOM flag.</returns>
    public static async Task<TextReadResult> ReadAllTextAsync(string path, CancellationToken ct = default)
    {
        var bytes = await File.ReadAllBytesAsync(path, ct);
        return Decode(bytes);
    }

    /// <summary>
    /// Splits <paramref name="text"/> on line-feed characters only.
    /// A carriage return that precedes a line feed stays at the end of its line.
    /// </summary>
    /// <param name="text">Text to split.</param>
    /// <returns>The segments between line feeds; a single element when there is no line feed.</returns>
    public static string[] SplitLines(string text)
        => text.Split('\n');

    /// <summary>
    /// Chooses the encoding to write with, based on what was detected when reading.
    /// For UTF-8 a new encoding is returned whose BOM emission matches <paramref name="sourceHasBom"/>;
    /// any other encoding is returned unchanged.
    /// </summary>
    /// <param name="sourceEncoding">Encoding detected for the source file.</param>
    /// <param name="sourceHasBom">Whether the source file started with a BOM.</param>
    /// <returns>The encoding to pass to <see cref="WriteAllTextAsync"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="sourceEncoding"/> is null.</exception>
    public static Encoding ResolveWriteEncoding(Encoding sourceEncoding, bool sourceHasBom)
    {
        if (sourceEncoding is null)
        {
            throw new ArgumentNullException(nameof(sourceEncoding));
        }

        if (sourceEncoding.CodePage == Encoding.UTF8.CodePage)
        {
            return new UTF8Encoding(sourceHasBom);
        }

        return sourceEncoding;
    }

    /// <summary>
    /// Writes <paramref name="content"/> to <paramref name="path"/> using <paramref name="encoding"/>,
    /// creating the parent directory when the path has one and replacing any existing file.
    /// </summary>
    /// <param name="path">Destination file path.</param>
    /// <param name="content">Text to write.</param>
    /// <param name="encoding">Encoding used for the written bytes (and for the preamble, if it defines one).</param>
    /// <param name="ct">Token passed to the write operation.</param>
    /// <exception cref="ArgumentNullException"><paramref name="encoding"/> is null.</exception>
    public static async Task WriteAllTextAsync(string path, string content, Encoding encoding, CancellationToken ct = default)
    {
        // Validate before FileMode.Create below truncates an existing file.
        if (encoding is null)
        {
            throw new ArgumentNullException(nameof(encoding));
        }

        var dir = Path.GetDirectoryName(path);
        if (!string.IsNullOrWhiteSpace(dir))
        {
            Directory.CreateDirectory(dir);
        }

        await using var stream = new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.Read, 4096, useAsync: true);
        await using var writer = new StreamWriter(stream, encoding);
        await writer.WriteAsync(content.AsMemory(), ct);
        await writer.FlushAsync();
    }

    /// <summary>UTF-8 encoding that does not emit a byte order mark.</summary>
    public static Encoding Utf8NoBom => s_utf8NoBom;

    /// <summary>Detects the encoding of <paramref name="bytes"/> and decodes everything after the BOM (if any).</summary>
    private static TextReadResult Decode(byte[] bytes)
    {
        var detected = DetectEncoding(bytes, out var bomLength, out var hasBom);

        // Decode in place from the BOM offset instead of copying the payload into a new array.
        var text = detected.GetString(bytes, bomLength, bytes.Length - bomLength);
        return new TextReadResult(text, detected, hasBom);
    }

    /// <summary>
    /// Detects the encoding of <paramref name="bytes"/>: UTF-8 / UTF-16 LE / UTF-16 BE by BOM,
    /// otherwise BOM-less UTF-8 when the bytes are well-formed UTF-8, otherwise EUC-KR.
    /// </summary>
    /// <param name="bytes">Raw file content.</param>
    /// <param name="bomLength">Number of leading BOM bytes to skip (0 when no BOM was found).</param>
    /// <param name="hasBom">True when a BOM was found.</param>
    private static Encoding DetectEncoding(byte[] bytes, out int bomLength, out bool hasBom)
    {
        // UTF-8 BOM
        if (bytes.Length >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF)
        {
            bomLength = 3;
            hasBom = true;
            return Encoding.UTF8;
        }

        // UTF-16 LE BOM
        if (bytes.Length >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
        {
            bomLength = 2;
            hasBom = true;
            return Encoding.Unicode;
        }

        // UTF-16 BE BOM
        if (bytes.Length >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
        {
            bomLength = 2;
            hasBom = true;
            return Encoding.BigEndianUnicode;
        }

        bomLength = 0;
        hasBom = false;

        if (IsValidUtf8(bytes))
        {
            return s_utf8NoBom;
        }

        // Fallback for compatibility with Korean Windows environments.
        try
        {
            return Encoding.GetEncoding("euc-kr");
        }
        catch (Exception ex) when (ex is ArgumentException or NotSupportedException)
        {
            // The code page is not available on this runtime; use the platform default instead.
            return Encoding.Default;
        }
    }

    /// <summary>
    /// Returns true when <paramref name="bytes"/> is entirely well-formed UTF-8.
    /// Validation is strict: overlong forms (e.g. lead bytes 0xC0/0xC1), encoded surrogates,
    /// values above U+10FFFF and truncated sequences are all rejected, so legacy double-byte
    /// text such as EUC-KR "C0 BA" is not mistaken for UTF-8.
    /// </summary>
    private static bool IsValidUtf8(byte[] bytes)
    {
        ReadOnlySpan<byte> remaining = bytes;
        while (!remaining.IsEmpty)
        {
            // Anything other than Done means invalid data or a sequence cut off at the end of the buffer.
            if (Rune.DecodeFromUtf8(remaining, out _, out var consumed) != System.Buffers.OperationStatus.Done)
            {
                return false;
            }

            remaining = remaining[consumed..];
        }

        return true;
    }
}
|
|
|