export class DoclingConverter {
constructor() {
this.simpleTagMap = {
doctag: "div",
document: "div",
ordered_list: "ol",
unordered_list: "ul",
list_item: "li",
caption: "figcaption",
footnote: "sup",
formula: "div",
page_footer: "footer",
page_header: "header",
picture: "figure",
chart: "figure",
table: "table",
otsl: "table",
text: "p",
paragraph: "p",
title: "h1",
document_index: "div",
form: "form",
key_value_region: "dl",
reference: "a",
smiles: "span",
};
this.selfClosingTagMap = {
checkbox_selected: '',
checkbox_unselected: '',
page_break: '
',
};
this.TABLE_TAG_CONFIG = {
"": { htmlTag: "th" },
"": { htmlTag: "th", scope: "row" },
"": { htmlTag: "th", scope: "row" },
"": { htmlTag: "td" },
"": { htmlTag: "td" },
"": { htmlTag: "td" },
"": { htmlTag: "td" },
"": { htmlTag: "td" },
};
this.TABLE_TAG_REGEX = new RegExp(`(${Object.keys(this.TABLE_TAG_CONFIG).join("|")})`);
const selfClosingNames = Object.keys(this.selfClosingTagMap).join("|");
this.combinedTagRegex = new RegExp(`(<([a-z_0-9]+)>(.*?)<\\/\\2>)|(<(${selfClosingNames})>)`, "s");
}
escapeHtml(text) {
if (!text) return "";
return text.replace(/&/g, "&").replace(//g, ">").replace(/"/g, """);
}
convert(docling) {
let html = ` ${docling} `;
html = this.cleanupMetadataTokens(html);
html = this.processTags(html);
return html.trim();
}
processTags(text) {
let remainingText = text;
let result = "";
while (remainingText.length > 0) {
const match = remainingText.match(this.combinedTagRegex);
if (match && typeof match.index === "number") {
const textBefore = remainingText.substring(0, match.index);
result += this.escapeHtml(textBefore);
const fullMatch = match[0];
const pairedTagName = match[2];
const pairedContent = match[3];
const selfClosingTagName = match[5];
if (pairedTagName !== undefined) {
result += this.convertSingleTag(pairedTagName, pairedContent);
} else if (selfClosingTagName !== undefined) {
result += this.selfClosingTagMap[selfClosingTagName] || "";
}
remainingText = remainingText.substring(match.index + fullMatch.length);
} else {
result += this.escapeHtml(remainingText);
break;
}
}
return result;
}
convertSingleTag(tagName, content) {
if (tagName === "list_item") {
content = content.trim().replace(/^[ยท-]\s*/g, "");
}
switch (tagName) {
case "code":
return this.convertBlockCode(content);
case "otsl":
return this.convertTable(content);
case "picture":
case "chart":
return this.convertPictureOrChart(tagName, content);
case "inline":
return this.convertInlineContent(content);
case "section_header_level_0":
case "section_header_level_1":
case "section_header_level_2":
case "section_header_level_3":
case "section_header_level_4":
case "section_header_level_5":
const level = parseInt(tagName.at(-1), 10) + 1;
return `${this.processTags(content)}`;
default:
const htmlTag = this.simpleTagMap[tagName];
if (htmlTag) {
const processedContent = this.processTags(content);
const startTag = this.getStartTag(tagName, htmlTag);
return `${startTag}${processedContent}${htmlTag}>`;
}
console.warn(`Unknown tag encountered: ${tagName}, escaping it.`);
return this.escapeHtml(`<${tagName}>${content}${tagName}>`);
}
}
getStartTag(doclingTag, htmlTag) {
switch (doclingTag) {
case "doctag":
case "document":
return '';
case "formula":
return '