按屬性陣列拆分文本字串-有解無憂

我有一個文本塊，它應該具有不同型別的格式。文本和格式化標簽分別存盤在字串和陣列中。我想創建一個將 2 結合在一起的資料結構。以下是實際作業資料：

格式化文本：

此文本是粗體，而這是斜體，這也是粗體。

文字串：

this text is bold while this is italic and this is bold too.

格式化標簽陣列：

{
  "bold":[
    [13,16],
    [51,54]
  ],
  "italic":[
    [32,37]
  ]
}

請注意，格式標簽陣列包含不同型別格式化文本的開始和結束。

現在的問題是：我如何合并這兩種型別的資訊來創建一個包含這兩種型別的好物件。可以轉換為 HTML 和 Markdown 的通用格式將不勝感激。我在想：

[
  {text:"this text is ", tags: []},
  {text: "bold", tags: ["bold"]},
  {text: "while this is ", tags: []},
  {text: "italic", tags: ["italic"]},
  {text: " and this is ", tags: []},
  {text: "bold", tags: ["bold"]},
]

另請注意，一個文本切片可能有多個標簽。

uj5u.com熱心網友回復：

可能的解決方案：

/**
 * Inverts a tag -> ranges map into a range -> tags map.
 *
 * Each range is used as a property key, so an array such as [13, 16]
 * becomes the string key "13,16". Tags sharing the same range end up in
 * the same array, in the order the tags are enumerated.
 *
 * @param {Object} tagsMap - tag name mapped to a list of [start, end] ranges
 * @returns {Object} rangeTagsMap - "start,end" key mapped to a list of tag names
 */
const _groupRangeTags = tagsMap => {
  const grouped = {};
  for (const [tagName, tagRanges] of Object.entries(tagsMap)) {
    tagRanges.forEach(span => {
      // A fresh array is built on every insertion, as the original did.
      const existing = grouped[span] ?? [];
      grouped[span] = [...existing, tagName];
    });
  }
  return grouped;
};
  
/**
 * Turns a range -> tags map into a list of range objects sorted by start.
 *
 * Keys have the form "start,end". Both parts are converted to numbers so
 * that later arithmetic (end + 1) and strict comparisons (start !== 0)
 * operate on numbers rather than strings.
 *
 * @param {Object} rangeTagsMap - "start,end" key mapped to a list of tag names
 * @returns {Array} rangeTagsList - [{ start, end, tags }] ordered by ascending start
 */
const _listRangesWithTags = rangeTagsMap =>
  Object.entries(rangeTagsMap)
    .map(([range, tags]) => {
      const [start, end] = range.split(',').map(Number);
      return { start, end, tags };
    })
    .sort((a, b) => a.start - b.start);

/**
 * Completes a sorted list of tagged ranges with untagged filler ranges so
 * that the result covers the whole string from index 0 to str.length - 1.
 *
 * All ranges use inclusive start and end indices. Filler ranges are only
 * emitted for real gaps: adjacent ranges (next.start === current.end + 1)
 * produce no empty filler, and overlapping or nested ranges never produce
 * an inverted filler.
 *
 * @param {Array} rangeTagsList - [{ start, end, tags }] sorted by ascending start
 * @param {String} str - the text the ranges refer to
 * @returns {Array} strRangeTagsList - tagged and untagged ranges in text order
 */
const _fillRangesWithoutTags = (rangeTagsList, str) => {
  const lastIndex = str.length - 1;

  // No tags at all: the whole string is a single untagged range.
  if (rangeTagsList.length === 0) {
    return [{ start: 0, end: lastIndex, tags: [] }];
  }

  const strRangeTagsList = [];
  // First index that is not yet covered by any emitted range.
  let cursor = 0;

  for (const range of rangeTagsList) {
    if (range.start > cursor) {
      strRangeTagsList.push({ start: cursor, end: range.start - 1, tags: [] });
    }
    strRangeTagsList.push(range);
    // Math.max keeps the cursor from moving backwards on nested ranges.
    cursor = Math.max(cursor, range.end + 1);
  }

  // Untagged tail after the last tagged range, if any text remains.
  if (cursor <= lastIndex) {
    strRangeTagsList.push({ start: cursor, end: lastIndex, tags: [] });
  }

  return strRangeTagsList;
};

/**
 * Replaces the index pair of every range with the text it covers.
 *
 * Range ends are inclusive, while String.prototype.substring takes an
 * exclusive end index, hence the end + 1.
 *
 * @param {Array} strRangeTagsList - [{ start, end, tags }] with inclusive indices
 * @param {String} str - the text the ranges refer to
 * @returns {Array} strRanges - [{ text, tags }] in the same order as the input
 */
const _getTextRanges = (strRangeTagsList, str) =>
  strRangeTagsList.map(({ start, end, tags }) => ({
    text: str.substring(start, end + 1),
    tags,
  }));

/**
 * Entry point: splits a string into consecutive text slices, each carrying
 * the tags that apply to it.
 *
 * Runs the four steps in order: group tags by range, list the ranges
 * sorted, fill in the untagged ranges, then cut the text.
 *
 * @param {String} str - the text to split
 * @param {Object} tagsMap - tag name mapped to a list of [start, end] ranges
 * @returns {Array} strRanges - [{ text, tags }] covering the string
 */
const _getRanges = (str, tagsMap = {}) => {
  const sortedTaggedRanges = _listRangesWithTags(_groupRangeTags(tagsMap));
  const allRanges = _fillRangesWithoutTags(sortedTaggedRanges, str);
  return _getTextRanges(allRanges, str);
};
  
// Demo: print the slices for the question's sample and for a sample in
// which two tags share exactly the same range.
const demoInputs = [
  ['this text is bold while this is italic and this is bold too.', { bold: [ [13,16], [51,54] ], italic: [ [32,37] ] }],
  ['this is bold while this is both bold and italic.', { bold: [[8,11],[32,46]], italic: [[32,46]] }],
];
for (const [sampleText, sampleTags] of demoInputs) {
  console.log( _getRanges(sampleText, sampleTags) );
}

uj5u.com熱心網友回復：

這是一個處理簡單重疊標簽的解決方案。

它相當密集，下面有部分解釋。

/**
 * Splits text into consecutive blocks, each annotated with the tags that
 * are active over it. Ranges of different tags may overlap freely.
 *
 * Works in two passes:
 *  1. Collect "events" per index: which tags open there and which close
 *     there. Input ranges are inclusive, so a range [start, end] closes
 *     at end + 1 (exclusive end, as String.prototype.slice expects).
 *  2. Walk the events in index order, maintaining the set of open tags,
 *     and emit one block between each pair of neighbouring events.
 *
 * Sentinel events at 0 and text.length guarantee that the untagged head
 * and tail of the text, including the final character, are emitted.
 *
 * Limitation: overlapping ranges of the SAME tag are not tracked
 * separately; the first close event for a tag removes it entirely.
 *
 * @param {String} text - the text to split
 * @param {Object} tags - tag name mapped to a list of inclusive [start, end] ranges
 * @returns {Array} blocks - [{ text, tags }] in text order
 */
const restructure = (text, tags) => {
  // index -> { open: [...tags], close: [...tags] }
  const events = new Map();
  const eventAt = (index) => {
    if (!events.has(index)) {
      events.set(index, { open: [], close: [] });
    }
    return events.get(index);
  };

  eventAt(0);
  eventAt(text.length);

  for (const [tag, ranges] of Object.entries(tags)) {
    for (const [start, end] of ranges) {
      eventAt(start).open.push(tag);
      eventAt(end + 1).close.push(tag);
    }
  }

  const ordered = [...events.entries()].sort(([a], [b]) => a - b);

  const blocks = [];
  let openTags = [];
  for (let i = 0; i < ordered.length - 1; i += 1) {
    const [index, event] = ordered[i];
    const [nextIndex] = ordered[i + 1];
    // Apply closes before opens, then de-duplicate while keeping order.
    const stillOpen = openTags.filter((tag) => !event.close.includes(tag));
    openTags = [...new Set([...stillOpen, ...event.open])];
    blocks.push({ text: text.slice(index, nextIndex), tags: openTags });
  }
  return blocks;
};

// Sample inputs: plain non-overlapping tags, three partially overlapping
// tags, and one tag fully nested inside another.
const testCases = [
  {
    text: "this text is bold while this is italic and this is bold too.",
    tags: { bold: [[13, 16], [51, 54]], italic: [[32, 37]] },
  },
  {
    text: "this text is bold while this is italic and this is bold too.",
    tags: { bold: [[13, 16], [51, 54]], red: [[29, 33]], italic: [[32, 37]] },
  },
  {
    text: "There's bold and italic then just italic and then there's neither.",
    tags: { bold: [[8, 11]], italic: [[8, 49]] },
  },
];

// Print each input followed by the blocks computed for it.
for (const sample of testCases) {
  console.log("Text: ", sample.text);
  console.log("Tags: ", sample.tags);
  console.log("Result: ", restructure(sample.text, sample.tags));
}

.as-console-wrapper {max-height: 100% !important; top: 0}

我們首先呼叫Object .entries (tags)，然后reduce對結果進行操作。如果我們從文本開始

"this text is bold while this is italic and this is bold too."

和標簽

{"bold":[[13,16],[51,54]],"italic":[[32,37]]}

這將產生

{
  "0": {close: [], open: []},
  "13": {close: [], open: ["bold"]},
  "17": {close: ["bold"], open: []},
  "32": {close: [], open: ["italic"]},
  "38": {close: ["italic"], open: []},
  "51": {close: [], open: ["bold"]},
  "55": {close: ["bold"], open: []},
  "59": {close: [], open: []}
}

Note that we shifted the end indices up by one, as the norm in JS is for the left index to be inclusive and the right one to be exclusive. "close" and "open" here are meant to indicate events we will process, and not current states.

We then wrap that up with another Object .entries and follow it with map and sort calls. The map is not actually necessary, and I would probably remove it in the end, but it feels cleaner than sorting numeric strings through subtraction.

That now turns this into an ordered list of event groups:

[
  [0, {"close": [], "open": []}],
  [13, {"close": [], "open": ["bold"]}],
  [17, {"close": ["bold"], "open": []}],
  [32, {"close": [], "open": ["italic"]}], 
  [38, {"close": ["italic"], "open": []}], 
  [51, {"close": [], "open": ["bold"]}],
  [55, {"close": ["bold"], "open": []}],
  [59, {"close": [], "open": []}]
]

Note that although this example only has single values tied with any index, a more complex scenario could have several opens and several closes in one place.

We do a second reduce on these, starting with an initial accumulator of {open: [], blocks: []}, keeping track of the collection of tags that are open, and appending to blocks on each event, leading to:

{
  blocks: [
    {tags: [],         text: "this text is "}, 
    {tags: ["bold"],   text: "bold"}, 
    {tags: [],         text: " while this is "}, 
    {tags: ["italic"], text: "italic"}, 
    {tags: [],         text: " and this is "}, 
    {tags: ["bold"],   text: "bold"}, 
    {tags: [],         text: " too"}
  ], 
  open: []
}

This bit

[... new Set(a .open .filter (tag => ! x [1] .close .includes (tag)) .concat (x [1] .open || []))]

first removes from the accumulator's open array all tags that are in the current close list, then adds all the ones in the current open list, and uses [...new Set (/* ... */)] to reduce to a list of unique values.

And then finally we simply return the blocks property of this accumulator object.

Syntactically, I know, it's still dense. If I find time soon, I will try to write a more imperative -- and probably much longer -- version of this.

Note that this will handle all sorts of overlap. But it will not handle nested versions of the same tag. If you have {bold: [[12, 20], [15, 30]]} it will likely close off all bold at the end of the first one (that is, at 20, not 30.) This is fixable using queues instead of simple arrays of opened tags, but is quite a bit more work, I would think.

轉載請註明出處，本文鏈接：https://www.uj5u.com/qukuanlian/434655.html

標籤：javascript 算法

上一篇：索引16超出長度16的范圍

下一篇：如何提高追加然后檢查串列中后續元素的效率