实现html和word的相互转换(带图片)
发布日期:2021-05-14 06:37:48 浏览次数:27 分类:精选文章

本文共 9981 字,大约阅读时间需要 33 分钟。

HTML与Word格式转换解决方案

项目背景

项目后端使用了Spring Boot和Maven,前端使用了CKEditor富文本编辑器。目前从HTML转换为Word时,发现了几个问题:

  • 格式不固定:部分页面的HTML格式不固定,导致Word导出时样式混乱。
  • 图片处理问题:支持的格式为DOCX,但需要手动将DOC转换为DOCX后才能进行图片替换,增加了操作复杂度。
  • 多图片转换易出错:经过多次尝试,发现直接从HTML生成Word文档存在诸多问题,尤其是包含多张图片时转换容易出错。
  • 依赖管理

    为实现Word与HTML的转换,需要引入以下依赖:

    • POI(Apache POI):用于处理Word文档的操作。
    • Jsoup:用于解析和操作HTML。
    • XDocReport:用于生成Word文档。
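    <!-- 注意:Apache POI 3.x 在 Maven 中央仓库发布的版本号为 3.14(不是 3.14.0),ooxml-schemas 为 1.3 -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>3.14</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-scratchpad</artifactId>
        <version>3.14</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>3.14</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>xdocreport</artifactId>
        <version>1.0.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml-schemas</artifactId>
        <version>3.14</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>ooxml-schemas</artifactId>
        <version>1.3</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>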

    Word与HTML互转(后端实现)

    1. DOC转HTML

    将DOC格式(Word 97-2003)文档导出为HTML的实现:

    /**
     * Converts the binary Word file {@code static/test.doc} into {@code static/test.html}.
     *
     * <p>Both files live under the directory returned by
     * {@code ResourceUtils.getURL("classpath:")}. Every picture embedded in the document is
     * written to {@code static/image/} and referenced from the HTML as {@code image/<name>}.
     *
     * @return the path of the generated HTML file
     * @throws Exception if the source document cannot be read or the HTML cannot be produced
     */
    public static String docToHtml() throws Exception {
        File path = new File(ResourceUtils.getURL("classpath:").getPath());
        // Build paths through File so the separator is correct on every OS
        // (the hard-coded "\\" only worked on Windows).
        File staticDir = new File(path.getAbsolutePath(), "static");
        File imageDir = new File(staticDir, "image");
        String sourceFileName = new File(staticDir, "test.doc").getPath();
        String targetFileName = new File(staticDir, "test.html").getPath();
        if (!imageDir.exists()) {
            imageDir.mkdirs();
        }

        // Close the source stream once the document object has been constructed;
        // the original never closed it.
        HWPFDocument wordDocument;
        try (FileInputStream in = new FileInputStream(sourceFileName)) {
            wordDocument = new HWPFDocument(in);
        }

        org.w3c.dom.Document document =
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
        // Called once per embedded picture: store the bytes on disk and return the
        // relative URL that the generated <img> should point at.
        wordToHtmlConverter.setPicturesManager((content, pictureType, name, width, height) -> {
            try (FileOutputStream out = new FileOutputStream(new File(imageDir, name))) {
                out.write(content);
            } catch (Exception e) {
                // Best effort: a picture that cannot be saved must not abort the conversion.
                e.printStackTrace();
            }
            return "image/" + name;
        });
        wordToHtmlConverter.processDocument(wordDocument);

        // Serialize the resulting DOM as indented UTF-8 HTML.
        org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(new File(targetFileName));
        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer serializer = tf.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
        return targetFileName;
    }

    2. DOCX转HTML

    将DOCX格式文档导出为HTML的实现:

    /**
     * Converts the OOXML Word file {@code static/test.docx} into {@code static/test.html}.
     *
     * <p>Both files live under the directory returned by
     * {@code ResourceUtils.getURL("classpath:")}. Embedded images are extracted below
     * {@code static/image} and image URIs in the HTML are resolved against {@code image}.
     *
     * @return the path of the generated HTML file
     * @throws Exception if the source document cannot be read or the HTML cannot be written
     */
    public static String docxToHtml() throws Exception {
        File path = new File(ResourceUtils.getURL("classpath:").getPath());
        // Build paths through File so the separator is correct on every OS
        // (the hard-coded "\\" only worked on Windows).
        File staticDir = new File(path.getAbsolutePath(), "static");
        File imageDir = new File(staticDir, "image");
        String sourceFileName = new File(staticDir, "test.docx").getPath();
        String targetFileName = new File(staticDir, "test.html").getPath();

        // try-with-resources closes the source stream, which the original leaked.
        try (FileInputStream in = new FileInputStream(sourceFileName)) {
            XWPFDocument document = new XWPFDocument(in);
            XHTMLOptions options = XHTMLOptions.create();
            options.setExtractor(new FileImageExtractor(imageDir));
            options.URIResolver(new BasicURIResolver("image"));
            // The target file is opened only after the document was loaded, as before.
            try (OutputStreamWriter outputStreamWriter =
                    new OutputStreamWriter(new FileOutputStream(targetFileName), "utf-8")) {
                XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
                xhtmlConverter.convert(document, outputStreamWriter, options);
            }
        }
        return targetFileName;
    }

    3. HTML转Word

    实现思路:

  • 提取HTML中的图片信息,替换为占位符。
  • 生成Word文档,使用占位符替换图片。
  • 处理文本样式,确保Word文档的格式与HTML一致。
  • public static String writeWordFile(String content) {
    String path = "D:/wordFile";
    Map
    param = new HashMap<>();
    if (!"".equals(path)) {
    File fileDir = new File(path);
    if (!fileDir.exists()) {
    fileDir.mkdirs();
    }
    content = HtmlUtils.htmlUnescape(content);
    List
    > imgs = getImgStr(content);
    int count = 0;
    for (Map
    img : imgs) {
    count++;
    content = content.replace(img.get("img"), "${imgReplace" + count + "}");
    content = content.replace(img.get("img1"), "${imgReplace" + count + "}");
    Map
    header = new HashMap<>();
    header.put("width", img.get("width"));
    header.put("height", img.get("height"));
    header.put("type", "jpg");
    header.put("content", OfficeUtil.inputStream2ByteArray(new FileInputStream(imagePath), true));
    param.put("${imgReplace" + count + "}", header);
    }
    try {
    POIFSFileSystem poifs = new POIFSFileSystem();
    DirectoryEntry directory = poifs.getRoot();
    DocumentEntry documentEntry = directory.createDocument("WordDocument", new ByteArrayInputStream(content.getBytes("UTF-8")));
    FileOutputStream ostream = new FileOutputStream("D:\\wordFile\\temp.doc");
    poifs.writeFilesystem(ostream);
    byte[] bytes = content.getBytes("UTF-8");
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    POIFSFileSystem poifsDoc = new POIFSFileSystem();
    DirectoryEntry directoryDoc = poifsDoc.getRoot();
    DocumentEntry documentEntryDoc = directoryDoc.createDocument("WordDocument", bais);
    FileOutputStream fopts = new FileOutputStream("D:\\wordFile\\final.docx");
    CustomXWPFDocument doc = OfficeUtil.generateWord(param, "D:\\wordFile\\temp.docx");
    doc.write(fopts);
    fopts.close();
    } catch (Exception e) {
    e.printStackTrace();
    }
    return "D:/wordFile/final.docx";
    }
    }

    前端实现

    1. 技术选择

    选择使用CKEditor作为富文本编辑器,结合File-Saver库实现文件导出。这种方式具有以下优势:

    • 灵活性:前端可以直接处理图片和样式。
    • 可扩展性:支持表格、图表等复杂元素。
    • 跨平台:适用于Vue和Angular等主流框架。

    2. 实现细节

    1. 后端接口

    /**
     * POST {@code /article/htmlFormat}: passes the submitted HTML to the article service
     * and returns whatever {@code formatHtmlStyle} produces.
     *
     * @param html HTML sent by the client
     * @return the result produced by {@code articleService.formatHtmlStyle}
     * @throws Exception propagated unchanged from the service
     */
    @PostMapping("/article/htmlFormat")
    public Ret html(String html) throws Exception {
        Ret formatted = articleService.formatHtmlStyle(html);
        return formatted;
    }

    2. 服务逻辑

    /**
     * Normalises paragraph styling and extracts the images of an HTML fragment.
     *
     * <p>Every {@code <p>} whose direct children include an {@code <img>} is centred; every
     * other {@code <p>} gets the fixed body-text style. Each {@code <img>} is then converted
     * through {@code CommonUtil.urlToBase64}, collected together with its position, and has
     * its {@code src} replaced by that position.
     *
     * @param html HTML to process
     * @return a {@code Ret} with code 200, the rewritten body HTML under {@code "html"} and
     *         the collected images under {@code "pics"}
     * @throws Exception declared by the original signature
     */
    public Ret formatHtmlStyle(String html) throws Exception {
        Document doc = Jsoup.parse(html);

        // Pass 1: choose the inline style of each paragraph by whether it wraps an image.
        for (Element paragraph : doc.getElementsByTag("p")) {
            boolean wrapsImage = paragraph.children().stream()
                    .anyMatch(child -> "img".equals(child.nodeName()));
            String style = wrapsImage
                    ? "text-align: center;"
                    : "font-family: FangSong_GB2312; font-size:18px; text-indent: 2em; line-height:34px; text-align:justify;";
            paragraph.attr("style", style);
        }

        // Pass 2: collect the images and swap each src for its running index.
        JSONArray picsArr = new JSONArray();
        int position = 0;
        for (Element image : doc.getElementsByTag("img")) {
            JSONObject picjo = new JSONObject();
            picjo.put("index", position);
            picjo.put("src", CommonUtil.urlToBase64(image.attr("src")));
            picsArr.add(picjo);
            image.attr("src", "" + position);
            position++;
        }

        return Ret.create().setCodeAndMsg(200).set("html", doc.body().html()).set("pics", picsArr);
    }

    3. 前端实现

    1. 安装依赖
    npm install file-saver --save
    bower install file-saver
    2. 导出逻辑
    import { saveAs } from 'file-saver';
    downloadArticle: function() {
    var FileSaver = require('file-saver');
    var html = "这里是你的html内容";
    this.$axios({
    method: 'post',
    url: '/article/htmlFormat',
    data: new URLSearchParams({
    html: html
    })
    }).then((response) => {
    const _static = {
    mhtml: {
    top: "Mime-Version: 1.0\nContent-Base: " + location.href + "\nContent-Type: Multipart/related; boundary=\"NEXT.ITEM-BOUNDARY\";type=\"text/html\"\n\n--NEXT.ITEM-BOUNDARY\nContent-Type: text/html; charset=\"utf-8\"\nContent-Location: " + location.href + "\n\n\n\n_html_",
    head: "\n
    \n\n\n",
    body: "_body_"
    }
    };
    const dealhtml = response.data.html;
    const img = response.data.pics;
    let mhtmlBottom = "\n";
    for (let i = 0; i < img.length; i++) {
    const uri = img[i].src;
    const index = img[i].index;
    mhtmlBottom += "--NEXT.ITEM-BOUNDARY\n";
    mhtmlBottom += "Content-Location: " + index + "\n";
    mhtmlBottom += "Content-Type: " + uri.substring(uri.indexOf(":") + 1, uri.indexOf(";")) + "\n";
    mhtmlBottom += "Content-Transfer-Encoding: " + uri.substring(uri.indexOf(";") + 1, uri.indexOf(",")) + "\n\n";
    mhtmlBottom += uri.substring(uri.indexOf(",") + 1) + "\n\n";
    }
    mhtmlBottom += "--NEXT.ITEM-BOUNDARY--";
    const fileContent = _static.mhtml.top.replace("_html_", _static.mhtml.body.replace("_body_", dealhtml)) + mhtmlBottom;
    const blob = new Blob([fileContent], {
    type: "application/msword;charset=utf-8"
    });
    saveAs(blob, `testImage.doc`);
    });
    }
    4. 注意事项
  • 图片处理:后端返回的每张图片必须是Base64编码的Data URI(形如 data:image/png;base64,...),前端会从中解析出Content-Type、编码方式和图片数据;格式不符会导致图片在Word中无法显示。
  • 样式处理:确保生成的Word文档样式与HTML一致,避免格式混乱。
  • 兼容性:测试不同浏览器和Word版本,确保稳定性。

  • 总结

    通过上述方法,可以实现HTML与Word格式的相互转换。在最终方案中,后端负责统一段落样式并将图片转换为Base64,前端负责拼装MHTML内容并导出为Word文件。这种方案既降低了后端的工作强度,又保证了前端的灵活性,适用于复杂的文档转换场景。

    上一篇:使用AOP给springboot项目添加日志
    下一篇:Kettle7.0实现主键,索引迁移

    发表评论

    最新留言

    不错!
    [***.144.177.141]2025年04月18日 16时52分56秒