Skip to content

Commit 8f867fc

Browse files
committed
issues/9402 解决文档向量化,文件名中文乱码导致失败问题
1 parent 254c388 commit 8f867fc

1 file changed

Lines changed: 3 additions & 4 deletions

File tree

  • jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/document

jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/document/TikaDocumentParser.java

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,6 @@
3030
import org.xml.sax.ContentHandler;
3131

3232
import java.io.*;
33-
import java.nio.file.Files;
3433
import java.util.Arrays;
3534
import java.util.HashSet;
3635
import java.util.List;
@@ -73,8 +72,8 @@ public TikaDocumentParser(Supplier<Parser> parserSupplier, Supplier<ContentHandl
7372
public Document parse(File file) {
7473
AssertUtils.assertNotEmpty("请选择文件", file);
7574
try {
76-
// 用于解析
77-
InputStream isForParsing = Files.newInputStream(file.toPath());
75+
// 用于解析(使用FileInputStream避免file.toPath()在Linux非UTF-8环境下中文文件名报错)
76+
InputStream isForParsing = new FileInputStream(file);
7877
// 使用 Tika 自动检测 MIME 类型
7978
String fileName = file.getName().toLowerCase();
8079
//后缀
@@ -102,7 +101,7 @@ public Document parse(File file) {
102101
*/
103102
public Document parseDocExcelPdfUsingApachePoi(File file) {
104103
AssertUtils.assertNotEmpty("请选择文件", file);
105-
try (InputStream inputStream = Files.newInputStream(file.toPath())) {
104+
try (InputStream inputStream = new FileInputStream(file)) {
106105
ApachePoiDocumentParser parser = new ApachePoiDocumentParser();
107106
Document document = parser.parse(inputStream);
108107
if (document == null || Utils.isNullOrBlank(document.text())) {

0 commit comments

Comments
 (0)