我有一個(可能)包含 HTML 標簽的字串。
我想根據(文本)字符長度將它拆分成更小的有效 HTML 字串。用例本質上是分頁。我知道可以放在一頁上的文本長度。所以我想根據該字符長度將目標字串分成“塊”或頁面。但我需要每個生成的頁面都包含有效的 HTML,而沒有未關閉的標簽等。
例如:
// Illustrative example from the question: the desired call shape and result.
const pageCharacterSize = 10
const testString = 'some <strong>text with HTML</strong> tags'
// Placeholder only: the implementation is what the question asks for.
function paginate(string, pageSize) {
  // @TODO
}
const pages = paginate(testString, pageCharacterSize)
console.log(pages)
// Desired output once implemented:
// ['some <strong>text </strong>', '<strong>with HTML</strong> ', 'tags']
我認為這可能與DocumentFragment或Range有關,但我無法弄清楚如何根據字符偏移量對頁面進行切片。
這個 MDN 頁面有一個演示,可以做一些接近我需要的事情。但它使用 caretPositionFromPoint(),而該方法以 X、Y 坐標作為引數。
更新
為了清楚起見,以下是我正在使用的測試:
import { expect, test } from 'vitest'
import paginate from './paginate'
// 1
test('it should chunk plain text', () => {
  // Each row is [input, page size, expected pages]; the letter matches the
  // original case label.
  const cases = [
    ['aa bb cc dd ee', 2, ['aa', 'bb', 'cc', 'dd', 'ee']], // a
    ['a a b b c c', 3, ['a a', 'b b', 'c c']], // b
    ['aa aa bb bb cc cc', 5, ['aa aa', 'bb bb', 'cc cc']], // c
    ['aa bb cc', 4, ['aa', 'bb', 'cc']], // d
    ['a b c d e f g', 5, ['a b c', 'd e f', 'g']], // e
    ['aa bb cc', 7, ['aa bb', 'cc']], // f
  ]
  for (const [input, size, wanted] of cases) {
    expect(paginate(input, size)).toStrictEqual(wanted)
  }
})
// 2
test('it should chunk an HTML string without stranding tags', () => {
  // Every page must carry its own opening and closing tags.
  const pages = paginate('aa <strong>bb</strong> <em>cc dd</em>', 3)
  expect(pages).toStrictEqual([
    'aa',
    '<strong>bb</strong>',
    '<em>cc</em>',
    '<em>dd</em>',
  ])
})
// 3
test('it should handle tags that straddle pages', () => {
  // A single element spanning three pages is re-opened and re-closed on each.
  const pages = paginate('<strong>aa bb cc</strong>', 2)
  expect(pages).toStrictEqual([
    '<strong>aa</strong>',
    '<strong>bb</strong>',
    '<strong>cc</strong>',
  ])
})
uj5u.com熱心網友回復:
這是一個假定并支持以下內容的解決方案:
- 沒有屬性的標簽(你可以調整正則運算式來支持它)
- 假設格式良好的標簽,例如不是:
<b><i>wrong nesting</b></i>、missing <b>end tag、missing start</b> tag
- 標簽可以嵌套
- 標簽會先被移除,稍後再恢復,以便正確計算每頁的字符數
- 頁面拆分是通過向后查找第一個空格來完成的
/**
 * Split an HTML string into pages by plain-text length, keeping every page
 * balanced: tags still open at a page break are closed at the end of that page
 * and re-opened at the start of the next one.
 *
 * Only attribute-less tags such as `<b>` or `</em>` are recognised, and the
 * input is expected to be well nested. Page breaks fall on whitespace.
 *
 * Known limitations of this approach:
 * - A run of non-whitespace characters longer than `pageSize` cannot be
 *   matched by the split pattern, so such text is not paginated correctly.
 * - A page can keep a leading space when an empty carried-over element is
 *   removed in front of it (e.g. `' <em>cc</em>'`).
 *
 * @param {string} html - Source string, possibly containing simple tags.
 * @param {number} pageSize - Maximum number of text characters per page.
 * @returns {string[]} The pages in order; empty when nothing can be matched.
 */
function paginate(html, pageSize) {
  // Optional leading whitespace, then up to pageSize characters, ending at a
  // position that is not followed by a non-whitespace character.
  const splitRegex = new RegExp(`\\s*[\\s\\S]{1,${pageSize}}(?!\\S)`, 'g');
  const tagsInfo = []; // saved tags with their position in the plain text
  const openTags = []; // open tags carried over to the next page
  let tagOffset = 0; // total length of the tags removed so far
  let pageOffset = 0; // start of the current page in the plain text

  // Strip the tags, remembering where each one sat relative to the text.
  const plainText = html.replace(/<\/?[a-z][a-z0-9]*>/gi, (tag, pos) => {
    tagsInfo.push({ tag, pos: pos - tagOffset });
    tagOffset += tag.length;
    return '';
  });

  // match() yields null (not an empty array) when nothing matches.
  const chunks = plainText.match(splitRegex);
  if (chunks === null) {
    return [];
  }

  return chunks.map((chunk) => {
    const nextOffset = pageOffset + chunk.length;
    // Tags left open by earlier pages are re-opened in front of this one.
    const prefix = openTags.join('');
    const pageTags = tagsInfo.filter(
      ({ pos }) => pos >= pageOffset && pos < nextOffset,
    );

    // Restore tags from last to first so earlier insert positions stay valid.
    let page = chunk;
    for (const { tag, pos } of [...pageTags].reverse()) {
      const at = pos - pageOffset;
      page = page.substring(0, at) + tag + page.substring(at);
    }

    // Update the list of open tags in document order.
    for (const { tag } of pageTags) {
      if (tag.startsWith('</')) {
        const index = openTags.indexOf(tag.replace('</', '<'));
        if (index >= 0) {
          openTags.splice(index, 1);
        }
      } else {
        openTags.push(tag);
      }
    }

    pageOffset = nextOffset;
    // Close whatever is still open, innermost first.
    const postfix = [...openTags]
      .reverse()
      .map((tag) => tag.replace('<', '</'))
      .join('');
    // Drop elements that ended up with empty content.
    return (prefix + page.trim() + postfix).replace(/<(\w+)><\/\1>/g, '');
  });
}
// Demo driver: paginate each sample and print the resulting pages.
for (const { str, size } of [
  { str: 'some <strong>text <i>with</i> HTML</strong> tags, and <i>some <b>nested tags</b> sould be <b>supported</b> as well</i>.', size: 16 },
  { str: 'a a b b c c', size: 3 },
  { str: 'aa aa bb bb cc cc', size: 5 },
  { str: 'aa bb cc', size: 4 },
  { str: 'aa <strong>bb</strong> <em>cc dd</em>', size: 3 },
  { str: '<strong>aa bb cc</strong>', size: 2 },
]) {
  console.log(paginate(str, size));
}
輸出:
[
"some <strong>text <i>with</i></strong>",
"<strong> HTML</strong> tags, and",
"<i>some <b>nested tags</b></i>",
"<i> sould be</i>",
"<i><b>supported</b> as</i>",
"<i>well</i>."
]
[
"a a",
"b b",
"c c"
]
[
"aa aa",
"bb bb",
"cc cc"
]
[
"aa",
"bb",
"cc"
]
[
"aa",
"<strong>bb</strong>",
" <em>cc</em>",
"<em>dd</em>"
]
[
"<strong>aa</strong>",
"<strong>bb</strong>",
"<strong>cc</strong>"
]
更新
根據評論中的新請求,我將拆分正則運算式從 '[\\s\\S]{1,' + pageSize + '}(?!\\S)' 修改為 '\\s*[\\s\\S]{1,' + pageSize + '}(?!\\S)',也就是添加了 \\s* 以捕獲前導空格。我還添加了 page.trim() 來移除前導空格。最后我添加了一些 OP 的示例。
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/537000.html
