我已將兩個 .pdf 檔案解析為兩個 .json 檔案,以便能夠讀取它們的內容。為此,我使用了 pdf2json(每個檔案都是分別解析的):
const fs = require("fs")
const PDFParser = require("pdf2json")
// Parse ./pdf/first.pdf with pdf2json and dump the parsed document to
// ./json/first.json as a JSON string.
const pdfParser = new PDFParser();

// Parser failures are reported on stderr.
pdfParser.on("pdfParser_dataError", (errData) => console.error(errData.parserError));

pdfParser.on("pdfParser_dataReady", (pdfData) => {
  fs.writeFile("./json/first.json", JSON.stringify(pdfData), (writeErr) => {
    // Report a failed write (missing ./json directory, permissions, ...)
    // instead of silently discarding it.
    if (writeErr) {
      console.error(writeErr);
    }
  });
});

pdfParser.loadPDF("./pdf/first.pdf");
first.json 的結構如下:
{
"Transcoder": "[email protected] [https://github.com/modesty/pdf2json]",
"Meta": {...},
"Pages": [
{
"Texts": [
{
"x": 3.95,
"y": 4.102,
"w": 3000.893,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "Some Awesome Text",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 22.895,
"y": 4.05,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "Some more important awesome text",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 38.755,
"y": 6.02,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 69.6868,
"y": 69.6868,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "text I do not want",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
}
]
}
]
}
第二個類似的檔案 second.json:
{
"Transcoder": "[email protected] [https://github.com/modesty/pdf2json]",
"Meta": {...},
"Pages": [
{
"Texts": [
{
"x": 3.85,
"y": 4.052,
"w": 3000.893,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "Some Awesome Text",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 22.855,
"y": 4.12,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "Some more important awesome text",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 38.855,
"y": 6.12,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 69.6969,
"y": 69.6969,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "text I do not want",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
}
]
}
]
}
請注意,這些檔案之間的"x"和"y"變數相差約 0.1。
這一點很重要,因為我想從兩個檔案中取得 "T" 的內容:"Some Awesome Text" 和 "Some more important awesome text"。"T" 的內容可能會變化,所以我不能依賴它;我只能透過檢查 "x" 和 "y" 是否落在某個範圍內來取得對應的內容。
現在我的問題是,如何實作它?
我的想法是寫一個巢狀的 for 迴圈,檢查 "x" 和 "y" 是否在範圍內,但之後我應該如何取得 "T"?
解決方案(感謝@timpa 和其他 SO 用戶!):
const PDFParser = require("pdf2json")
/**
 * Parse a PDF with pdf2json and hand back the parsed document.
 *
 * @param {string} [pdfPath="./pdf/first.pdf"] - Path passed to `loadPDF`.
 * @returns {Promise<object>} Resolves with the payload of the
 *   "pdfParser_dataReady" event. Rejects with an Error when the parser emits
 *   "pdfParser_dataError" (or when an error argument accompanies dataReady),
 *   so callers are never left awaiting a promise that cannot settle.
 */
const extractPDF = (pdfPath = "./pdf/first.pdf") => {
  return new Promise((resolve, reject) => {
    const pdfParser = new PDFParser();

    // Normalise whatever the parser reports into an Error instance.
    const toError = (cause) => (cause instanceof Error ? cause : new Error(String(cause)));

    // Attach both listeners before starting the load so no event can be missed.
    pdfParser.on("pdfParser_dataError", (errData) => {
      const cause = errData && errData.parserError !== undefined ? errData.parserError : errData;
      reject(toError(cause));
    });

    pdfParser.on("pdfParser_dataReady", (pdfData, err) => {
      if (err) {
        reject(toError(err));
        return;
      }
      resolve(pdfData);
    });

    pdfParser.loadPDF(pdfPath);
  });
};
/**
 * Pick the text whose position falls inside a coordinate window on the first
 * page of a pdf2json document.
 *
 * Only position is used for matching; the text itself is never inspected.
 * If several items fall inside the window, the last one wins.
 *
 * @param {object} jsonContent - Parsed document with `Pages[0].Texts`, each
 *   item carrying `x`, `y` and `R[0].T`.
 * @param {{xMin: number, xMax: number, yMin: number, yMax: number}} [bounds]
 *   Inclusive window; defaults to x in [3.85, 4.05] and y in [4.002, 4.202].
 * @returns {{content?: string}} `{ content }` with the decoded text, or an
 *   empty object when nothing matches.
 */
const getTargetContent = (
  jsonContent,
  bounds = { xMin: 3.85, xMax: 4.05, yMin: 4.002, yMax: 4.202 },
) => {
  const { xMin, xMax, yMin, yMax } = bounds;
  const data = {};

  // A document without pages or texts simply yields no match.
  const pages = (jsonContent && jsonContent.Pages) || [];
  const texts = (pages[0] && pages[0].Texts) || [];

  for (const text of texts) {
    // Inclusive comparisons: a coordinate sitting exactly on the edge of the
    // window (e.g. x = 3.85) still counts as inside.
    const isInside = xMin <= text.x && text.x <= xMax && yMin <= text.y && text.y <= yMax;
    const firstRun = text.R && text.R[0];
    if (isInside && firstRun) {
      // decodeURIComponent also restores reserved characters such as
      // "," ":" "/" that decodeURI would leave percent-encoded.
      data.content = decodeURIComponent(firstRun.T);
    }
  }

  return data;
};
/**
 * Entry point: parse the PDF, extract the targeted text and print it.
 *
 * Any failure from parsing or extraction is reported on stderr and reflected
 * in the process exit code instead of surfacing as an unhandled rejection.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  try {
    const pdfContent = await extractPDF();
    const targetContent = getTargetContent(pdfContent);
    console.log(targetContent);
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  }
};

main();
uj5u.com熱心網友回復:
我認為您甚至不需要巢狀迴圈,只要像下面這樣做就可以了:
// Walk every text item on the first page; a single pass is enough.
for (const text of obj.Pages[0].Texts) {
  // check text.x, text.y, and text.R[0].T
}
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/465848.html
標籤:javascript node.js json
