From 4f2c8bd905f1b8fb0785881dd48528d26aff6350 Mon Sep 17 00:00:00 2001 From: one Date: Mon, 23 Jun 2025 15:19:21 +0800 Subject: [PATCH] fix(Markdown): improve latex brackets handling (#7358) --- package.json | 2 + .../src/pages/home/Markdown/Markdown.tsx | 6 +- .../home/Markdown/__tests__/Markdown.test.tsx | 14 +- .../src/utils/__tests__/formats.test.ts | 31 --- .../src/utils/__tests__/markdown.test.ts | 195 ++++++++++++++++++ src/renderer/src/utils/formats.ts | 18 -- src/renderer/src/utils/markdown.ts | 80 +++++++ yarn.lock | 16 ++ 8 files changed, 298 insertions(+), 64 deletions(-) diff --git a/package.json b/package.json index 7a673b8417..ee795f17ee 100644 --- a/package.json +++ b/package.json @@ -112,6 +112,7 @@ "@testing-library/jest-dom": "^6.6.3", "@testing-library/react": "^16.3.0", "@tryfabric/martian": "^1.2.4", + "@types/balanced-match": "^3", "@types/diff": "^7", "@types/fs-extra": "^11", "@types/lodash": "^4.17.5", @@ -138,6 +139,7 @@ "archiver": "^7.0.1", "async-mutex": "^0.5.0", "axios": "^1.7.3", + "balanced-match": "^3.0.1", "browser-image-compression": "^2.0.2", "color": "^5.0.0", "dayjs": "^1.11.11", diff --git a/src/renderer/src/pages/home/Markdown/Markdown.tsx b/src/renderer/src/pages/home/Markdown/Markdown.tsx index 2a6446fec7..454550c5c8 100644 --- a/src/renderer/src/pages/home/Markdown/Markdown.tsx +++ b/src/renderer/src/pages/home/Markdown/Markdown.tsx @@ -8,8 +8,8 @@ import { useSettings } from '@renderer/hooks/useSettings' import { EVENT_NAMES, EventEmitter } from '@renderer/services/EventService' import type { MainTextMessageBlock, ThinkingMessageBlock, TranslationMessageBlock } from '@renderer/types/newMessage' import { parseJSON } from '@renderer/utils' -import { escapeBrackets, removeSvgEmptyLines } from '@renderer/utils/formats' -import { findCitationInChildren, getCodeBlockId } from '@renderer/utils/markdown' +import { removeSvgEmptyLines } from '@renderer/utils/formats' +import { findCitationInChildren, getCodeBlockId, processLatexBrackets } from '@renderer/utils/markdown' import { isEmpty } from 'lodash' import { type FC, memo, useCallback, useMemo } from 'react' import { useTranslation } from 'react-i18next' @@ -52,7 +52,7 @@ const Markdown: FC = ({ block }) => { const empty = isEmpty(block.content) const paused = block.status === 'paused' const content = empty && paused ? t('message.chat.completion.paused') : block.content - return removeSvgEmptyLines(escapeBrackets(content)) + return removeSvgEmptyLines(processLatexBrackets(content)) }, [block, t]) const rehypePlugins = useMemo(() => { diff --git a/src/renderer/src/pages/home/Markdown/__tests__/Markdown.test.tsx b/src/renderer/src/pages/home/Markdown/__tests__/Markdown.test.tsx index abd7067ab0..be9b18c13b 100644 --- a/src/renderer/src/pages/home/Markdown/__tests__/Markdown.test.tsx +++ b/src/renderer/src/pages/home/Markdown/__tests__/Markdown.test.tsx @@ -42,13 +42,13 @@ vi.mock('@renderer/utils', () => ({ })) vi.mock('@renderer/utils/formats', () => ({ - escapeBrackets: vi.fn((str) => str), removeSvgEmptyLines: vi.fn((str) => str) })) vi.mock('@renderer/utils/markdown', () => ({ findCitationInChildren: vi.fn(() => '{"id": 1, "url": "https://example.com"}'), - getCodeBlockId: vi.fn(() => 'code-block-1') + getCodeBlockId: vi.fn(() => 'code-block-1'), + processLatexBrackets: vi.fn((str) => str) })) // Mock components with more realistic behavior @@ -212,16 +212,6 @@ describe('Markdown', () => { expect(markdown).not.toHaveTextContent('Paused') }) - it('should process content through format utilities', async () => { - const { escapeBrackets, removeSvgEmptyLines } = await import('@renderer/utils/formats') - const content = 'Content with [brackets] and SVG' - - render() - - expect(escapeBrackets).toHaveBeenCalledWith(content) - expect(removeSvgEmptyLines).toHaveBeenCalledWith(content) - }) - it('should match snapshot', () => { const { container } = render() expect(container.firstChild).toMatchSnapshot() diff --git a/src/renderer/src/utils/__tests__/formats.test.ts b/src/renderer/src/utils/__tests__/formats.test.ts index 5a817f45d9..09189b4526 100644 --- a/src/renderer/src/utils/__tests__/formats.test.ts +++ b/src/renderer/src/utils/__tests__/formats.test.ts @@ -6,7 +6,6 @@ import { describe, expect, it, vi } from 'vitest' import { addImageFileToContents, encodeHTML, - escapeBrackets, escapeDollarNumber, extractTitle, removeSvgEmptyLines, @@ -180,36 +179,6 @@ describe('formats', () => { }) }) - describe('escapeBrackets', () => { - it('should convert \\[...\\] to display math format', () => { - expect(escapeBrackets('The formula is \\[a+b=c\\]')).toBe('The formula is \n$$\na+b=c\n$$\n') - }) - - it('should convert \\(...\\) to inline math format', () => { - expect(escapeBrackets('The formula is \\(a+b=c\\)')).toBe('The formula is $a+b=c$') - }) - - it('should not affect code blocks', () => { - const codeBlock = 'This is text with a code block ```const x = \\[1, 2, 3\\]```' - expect(escapeBrackets(codeBlock)).toBe(codeBlock) - }) - - it('should not affect inline code', () => { - const inlineCode = 'This is text with `const x = \\[1, 2, 3\\]` inline code' - expect(escapeBrackets(inlineCode)).toBe(inlineCode) - }) - - it('should handle multiple occurrences', () => { - const input = 'Formula 1: \\[a+b=c\\] and formula 2: \\(x+y=z\\)' - const expected = 'Formula 1: \n$$\na+b=c\n$$\n and formula 2: $x+y=z$' - expect(escapeBrackets(input)).toBe(expected) - }) - - it('should handle empty string', () => { - expect(escapeBrackets('')).toBe('') - }) - }) - describe('extractTitle', () => { it('should extract title from HTML string', () => { const html = 'Page TitleContent' diff --git a/src/renderer/src/utils/__tests__/markdown.test.ts b/src/renderer/src/utils/__tests__/markdown.test.ts index e35550bf49..4f48deba0c 100644 --- a/src/renderer/src/utils/__tests__/markdown.test.ts +++ b/src/renderer/src/utils/__tests__/markdown.test.ts @@ -9,6 +9,7 @@ import { getCodeBlockId, getExtensionByLanguage, markdownToPlainText, + processLatexBrackets, removeTrailingDoubleSpaces, updateCodeBlock } from '../markdown' @@ -461,4 +462,198 @@ describe('markdown', () => { expect(markdownToPlainText('This is plain text.')).toBe('This is plain text.') }) }) + + describe('processLatexBrackets', () => { + describe('basic LaTeX conversion', () => { + it('should convert display math \\[...\\] to $$...$$', () => { + expect(processLatexBrackets('The formula is \\[a+b=c\\]')).toBe('The formula is $$a+b=c$$') + }) + + it('should convert inline math \\(...\\) to $...$', () => { + expect(processLatexBrackets('The formula is \\(a+b=c\\)')).toBe('The formula is $a+b=c$') + }) + }) + + describe('code block protection', () => { + it('should not affect multi-line code blocks', () => { + const input = 'Text ```const arr = \\[1, 2, 3\\]\\nconst func = \\(x\\) => x``` more text' + expect(processLatexBrackets(input)).toBe(input) + }) + + it('should not affect inline code', () => { + const input = 'This is text with `const x = \\[1, 2, 3\\]` inline code' + expect(processLatexBrackets(input)).toBe(input) + }) + + it('should handle mixed code and LaTeX', () => { + const input = 'Math: \\[x + y\\] and code: `arr = \\[1, 2\\]` and more math: \\(z\\)' + const expected = 'Math: $$x + y$$ and code: `arr = \\[1, 2\\]` and more math: $z$' + expect(processLatexBrackets(input)).toBe(expected) + }) + + it('should protect complex code blocks', () => { + for (const [input, expected] of new Map([ + [ + '```javascript\\nconst latex = "\\\\[formula\\\\]"\\n```', + '```javascript\\nconst latex = "\\\\[formula\\\\]"\\n```' + ], + ['`\\[escaped brackets\\]`', '`\\[escaped brackets\\]`'], + [ + '```\\narray = \\[\\n \\(item1\\),\\n \\(item2\\)\\n\\]\\n```', + '```\\narray = \\[\\n \\(item1\\),\\n \\(item2\\)\\n\\]\\n```' + ] + ])) { + expect(processLatexBrackets(input)).toBe(expected) + } + }) + }) + + describe('link protection', () => { + it('should not affect LaTeX in link text', () => { + const input = '[\\[pdf\\] Document](https://example.com/doc.pdf)' + expect(processLatexBrackets(input)).toBe(input) + }) + + it('should not affect LaTeX in link URLs', () => { + const input = '[Click here](https://example.com/path\\[with\\]brackets)' + expect(processLatexBrackets(input)).toBe(input) + }) + + it('should handle mixed links and LaTeX', () => { + const input = 'See [\\[pdf\\] file](url) for formula \\[x + y = z\\]' + const expected = 'See [\\[pdf\\] file](url) for formula $$x + y = z$$' + expect(processLatexBrackets(input)).toBe(expected) + }) + + it('should protect complex link patterns', () => { + for (const [input, expected] of new Map([ + ['[Title with \\(math\\)](https://example.com)', '[Title with \\(math\\)](https://example.com)'], + ['[Link](https://example.com/\\[path\\]/file)', '[Link](https://example.com/\\[path\\]/file)'], + [ + '[\\[Section 1\\] Overview](url) and \\[math formula\\]', + '[\\[Section 1\\] Overview](url) and $$math formula$$' + ] + ])) { + expect(processLatexBrackets(input)).toBe(expected) + } + }) + }) + + describe('edge cases', () => { + it('should handle empty string', () => { + expect(processLatexBrackets('')).toBe('') + }) + + it('should handle content without LaTeX', () => { + for (const [input, expected] of new Map([ + ['Regular text without math', 'Regular text without math'], + ['Text with [regular] brackets', 'Text with [regular] brackets'], + ['Text with (parentheses)', 'Text with (parentheses)'], + ['No special characters here', 'No special characters here'] + ])) { + expect(processLatexBrackets(input)).toBe(expected) + } + }) + + it('should handle malformed LaTeX patterns', () => { + for (const [input, expected] of new Map([ + ['\\[unclosed bracket', '\\[unclosed bracket'], + ['unopened bracket\\]', 'unopened bracket\\]'], + ['\\(unclosed paren', '\\(unclosed paren'], + ['unopened paren\\)', 'unopened paren\\)'], + ['\\[\\]', '$$$$'], // Empty LaTeX block + ['\\(\\)', '$$'] // Empty LaTeX inline + ])) { + expect(processLatexBrackets(input)).toBe(expected) + } + }) + + it('should handle nested brackets', () => { + for (const [input, expected] of new Map([ + ['\\[outer \\[inner\\] formula\\]', '$$outer \\[inner\\] formula$$'], + ['\\(a + \\(b + c\\)\\)', '$a + \\(b + c\\)$'] + ])) { + expect(processLatexBrackets(input)).toBe(expected) + } + }) + }) + + describe('complex cases', () => { + it('should handle complex mixed content', () => { + const complexInput = ` +# Mathematical Document + +Here's a simple formula \\(E = mc^2\\) in text. + +## Section 1: Equations + +The quadratic formula is \\[x = \\frac{-b \\pm \\sqrt{b^2-4ac}}{2a}\\]. + +- Item 1: See formula \\(\\alpha + \\beta = \\gamma\\) in this list +- Item 2: Check [\\[PDF\\] Complex Analysis](https://example.com/math.pdf) + - Subitem 2.1: Basic concepts and definitions + - Subitem 2.2: The Cauchy-Riemann equations \\[\\frac{\\partial u}{\\partial x} = \\frac{\\partial v}{\\partial y}, \\quad \\frac{\\partial u}{\\partial y} = -\\frac{\\partial v}{\\partial x}\\] + - Subitem 2.3: Green's theorem connects line integrals and double integrals + \\[ + \\oint_C (P dx + Q dy) = \\iint_D \\left(\\frac{\\partial Q}{\\partial x} - \\frac{\\partial P}{\\partial y}\\right) dx dy + \\] + - Subitem 2.4: Applications in engineering and physics +- Item 3: The sum \\[\\sum_{i=1}^{n} \\frac{1}{i^2} = \\frac{\\pi^2}{6}\\] is famous + +\`\`\`javascript +// Code should not be affected +const matrix = \\[ + \\[1, 2\\], + \\[3, 4\\] +\\]; +const func = \\(x\\) => x * 2; +\`\`\` + +Read more in [Section \\[3.2\\]: Advanced Topics](url) and see inline code \`\\[array\\]\`. + +Final thoughts on \\(\\nabla \\cdot \\vec{F} = \\rho\\) and display math: + +\\[\\int_0^\\infty e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}\\] +` + + const expectedOutput = ` +# Mathematical Document + +Here's a simple formula $E = mc^2$ in text. + +## Section 1: Equations + +The quadratic formula is $$x = \\frac{-b \\pm \\sqrt{b^2-4ac}}{2a}$$. + +- Item 1: See formula $\\alpha + \\beta = \\gamma$ in this list +- Item 2: Check [\\[PDF\\] Complex Analysis](https://example.com/math.pdf) + - Subitem 2.1: Basic concepts and definitions + - Subitem 2.2: The Cauchy-Riemann equations $$\\frac{\\partial u}{\\partial x} = \\frac{\\partial v}{\\partial y}, \\quad \\frac{\\partial u}{\\partial y} = -\\frac{\\partial v}{\\partial x}$$ + - Subitem 2.3: Green's theorem connects line integrals and double integrals + $$ + \\oint_C (P dx + Q dy) = \\iint_D \\left(\\frac{\\partial Q}{\\partial x} - \\frac{\\partial P}{\\partial y}\\right) dx dy + $$ + - Subitem 2.4: Applications in engineering and physics +- Item 3: The sum $$\\sum_{i=1}^{n} \\frac{1}{i^2} = \\frac{\\pi^2}{6}$$ is famous + +\`\`\`javascript +// Code should not be affected +const matrix = \\[ + \\[1, 2\\], + \\[3, 4\\] +\\]; +const func = \\(x\\) => x * 2; +\`\`\` + +Read more in [Section \\[3.2\\]: Advanced Topics](url) and see inline code \`\\[array\\]\`. + +Final thoughts on $\\nabla \\cdot \\vec{F} = \\rho$ and display math: + +$$\\int_0^\\infty e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$ +` + + expect(processLatexBrackets(complexInput)).toBe(expectedOutput) + }) + }) + }) }) diff --git a/src/renderer/src/utils/formats.ts b/src/renderer/src/utils/formats.ts index 559b4e7a52..ee64efd443 100644 --- a/src/renderer/src/utils/formats.ts +++ b/src/renderer/src/utils/formats.ts @@ -53,24 +53,6 @@ export function escapeDollarNumber(text: string) { return escapedText } -export function escapeBrackets(text: string) { - const pattern = /(```[\s\S]*?```|`.*?`)|\\\[([\s\S]*?[^\\])\\]|\\\((.*?)\\\)/g - return text.replace(pattern, (match, codeBlock, squareBracket, roundBracket) => { - if (codeBlock) { - return codeBlock - } else if (squareBracket) { - return ` -$$ -${squareBracket} -$$ -` - } else if (roundBracket) { - return `$${roundBracket}$` - } - return match - }) -} - export function extractTitle(html: string): string | null { if (!html) return null diff --git a/src/renderer/src/utils/markdown.ts b/src/renderer/src/utils/markdown.ts index a54e3d69d0..57025ca633 100644 --- a/src/renderer/src/utils/markdown.ts +++ b/src/renderer/src/utils/markdown.ts @@ -1,4 +1,5 @@ import { languages } from '@shared/config/languages' +import balanced from 'balanced-match' import remarkParse from 'remark-parse' import remarkStringify from 'remark-stringify' import removeMarkdown from 'remove-markdown' @@ -29,6 +30,85 @@ export const findCitationInChildren = (children: any): string => { return '' } +// 检查是否包含潜在的 LaTeX 模式 +const containsLatexRegex = /\\\(.*?\\\)|\\\[.*?\\\]|\$.*?\$|\\begin\{equation\}.*?\\end\{equation\}/ + +/** + * 转换 LaTeX 公式括号 `\[\]` 和 `\(\)` 为 Markdown 格式 `$$...$$` 和 `$...$` + * + * remark-math 本身不支持 LaTeX 原生语法,作为替代的一些插件效果也不理想。 + * + * 目前的实现: + * - 保护代码块和链接,避免被 remark-math 处理 + * - 支持嵌套括号的平衡匹配 + * - 转义 `\\(x\\)` 会被处理为 `\$x\$`,`\\[x\\]` 会被处理为 `\$$x\$$` + * + * @see https://github.com/remarkjs/remark-math/issues/39 + * @param text 输入的 Markdown 文本 + * @returns 处理后的字符串 + */ +export const processLatexBrackets = (text: string) => { + // 没有 LaTeX 模式直接返回 + if (!containsLatexRegex.test(text)) { + return text + } + + // 保护代码块和链接 + const protectedItems: string[] = [] + let processedContent = text + + processedContent = processedContent + // 保护代码块(包括多行代码块和行内代码) + .replace(/(```[\s\S]*?```|`[^`]*`)/g, (match) => { + const index = protectedItems.length + protectedItems.push(match) + return `__CHERRY_STUDIO_PROTECTED_${index}__` + }) + // 保护链接 [text](url) + .replace(/\[([^[\]]*(?:\[[^\]]*\][^[\]]*)*)\]\([^)]*?\)/g, (match) => { + const index = protectedItems.length + protectedItems.push(match) + return `__CHERRY_STUDIO_PROTECTED_${index}__` + }) + + // LaTeX 括号转换函数 + const processMath = (content: string, openDelim: string, closeDelim: string, wrapper: string): string => { + let result = '' + let remaining = content + + while (remaining.length > 0) { + const match = balanced(openDelim, closeDelim, remaining) + if (!match) { + result += remaining + break + } + + result += match.pre + result += `${wrapper}${match.body}${wrapper}` + remaining = match.post + } + + return result + } + + // 先处理块级公式,再处理内联公式 + let result = processMath(processedContent, '\\[', '\\]', '$$') + result = processMath(result, '\\(', '\\)', '$') + + // 还原被保护的内容 + result = result.replace(/__CHERRY_STUDIO_PROTECTED_(\d+)__/g, (match, indexStr) => { + const index = parseInt(indexStr, 10) + // 添加边界检查,防止数组越界 + if (index >= 0 && index < protectedItems.length) { + return protectedItems[index] + } + // 如果索引无效,保持原始匹配 + return match + }) + + return result +} + /** * 转换数学公式格式: * - 将 LaTeX 格式的 '\\[' 和 '\\]' 转换为 '$$$$'。 diff --git a/yarn.lock b/yarn.lock index eefde56f9c..75f9e2f434 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4115,6 +4115,13 @@ __metadata: languageName: node linkType: hard +"@types/balanced-match@npm:^3": + version: 3.0.2 + resolution: "@types/balanced-match@npm:3.0.2" + checksum: 10c0/833f6499609363537026c4ec2770af5c5a36e71b80f7b5b23884b15296301bfcf974cd40bc75fda940dea4994acd96c9222b284c248383a1ade59bf8835940b0 + languageName: node + linkType: hard + "@types/cacheable-request@npm:^6.0.1": version: 6.0.3 resolution: "@types/cacheable-request@npm:6.0.3" @@ -5639,6 +5646,7 @@ __metadata: "@testing-library/jest-dom": "npm:^6.6.3" "@testing-library/react": "npm:^16.3.0" "@tryfabric/martian": "npm:^1.2.4" + "@types/balanced-match": "npm:^3" "@types/diff": "npm:^7" "@types/fs-extra": "npm:^11" "@types/lodash": "npm:^4.17.5" @@ -5665,6 +5673,7 @@ __metadata: archiver: "npm:^7.0.1" async-mutex: "npm:^0.5.0" axios: "npm:^1.7.3" + balanced-match: "npm:^3.0.1" browser-image-compression: "npm:^2.0.2" color: "npm:^5.0.0" dayjs: "npm:^1.11.11" @@ -6297,6 +6306,13 @@ __metadata: languageName: node linkType: hard +"balanced-match@npm:^3.0.1": + version: 3.0.1 + resolution: "balanced-match@npm:3.0.1" + checksum: 10c0/ac8dd63a5b260610c2cbda982f436e964c1b9ae8764d368a523769da40a31710abd6e19f0fdf1773c4ad7b2ea7ba7b285d547375dc723f6e754369835afc8e9f + languageName: node + linkType: hard + "bare-events@npm:^2.2.0": version: 2.5.4 resolution: "bare-events@npm:2.5.4"