我正在嘗試從 tif 檔案中擷取資料,將其存入 data,並建立一個 pandas DataFrame(df0)。
# Question's original script: OCR each .tif page with pytesseract, search the
# recognised text with five regexes, and build a DataFrame from the results.
# NOTE: indentation was lost when this snippet was published, so which
# statements belong to the `for` body cannot be read off this text.
data = []
listOfPages = glob.glob(r"C:/Users/name/*.tif")
for entry in listOfPages:
text = pytesseract.image_to_string(
# NOTE(review): Tesseract language codes are normally three letters
# (e.g. "eng") - confirm that "en" is really installed.
Image.open(entry), lang="en"
)
# This appends the bare OCR string to `data`. The same list later receives a
# 6-item row per page, so `data` ends up mixing plain strings with rows while
# the DataFrame call below names 6 columns.
data.append(text)
# NOTE(review): inside a raw string, '\\n' matches a literal backslash followed
# by "n", not a newline - confirm that this is intended (also in `frage1`).
duck1 = re.compile(r'(CHE)(.*)\\n', flags = re.DOTALL | re.MULTILINE)
asker1 = re.compile(r'(my|the)\s zzzz(.*)office', flags = re.DOTALL | re.MULTILINE)
date1 = re.compile(r'\s dfg(\d{2}\.\d{2}\.\d{4})', flags = re.DOTALL | re.MULTILINE)
# This pattern is bound to `th1`, but the search further down uses `thema1`.
th1 = re.compile(r'(gh|gh)\s fg\s sdf(.*)(rrr,\s rtr)', flags = re.DOTALL | re.MULTILINE)
frage1 = re.compile(r'(\\neee)(.*)(we|wzz)\s drte\s Srrr:', flags = re.DOTALL | re.MULTILINE)
# Each try-block below runs one pattern against `text` and stores a group of
# the first match, or None when nothing matched. The bare `except: pass`
# silently discards any error raised inside the block.
try:
d2 = duck1.search(text)
if d2:
dru = d2.group(1)
else:
dru = None
except:
pass
try:
asker2 = asker1.search(text)
if asker2:
asker = asker2.group(1)
else:
asker = None
except:
pass
try:
date2 = date1.search(text)
if date2:
# group(0) is the whole match, i.e. including the text before the digits.
datr = date2.group(0)
else:
datr = None
except:
pass
try:
# `thema1` is not defined anywhere in the visible snippet; the resulting
# NameError is swallowed by the bare except, so `thema` is not assigned here.
thema2 = thema1.search(text)
if thema2:
thema = thema2.group(1)
else:
thema = None
except:
pass
try:
frage2 = frage1.search(text)
if frage2:
frage = frage2.group(1)
else:
frage = None
except:
pass
# One 6-item row: raw text followed by the five extracted fields.
data.append([text, dru, asker, datr, thema, frage])
df0 = pd.DataFrame(data, columns =['raw_text', 'wer', 'asker', 'date', 'area', 'que_text'])
print(df0)
ValueError: Shape of passed values is (20, 1), indices imply (20, 6)
我究竟做錯了什麼?就我對這個錯誤的理解(讀過主題相同但情境不同的討論),應該是索引重疊了,我是否需要在追加之前先刪除某些內容?
uj5u.com熱心網友回復:
你只需要停止在迴圈開頭把 text 單獨追加到 data,因為你在迴圈的最後已經把它連同其他欄位一起追加進去了。這樣 data 就會是一個形狀為 (20, 6) 的「串列的串列」,這正是你需要的。
# Build one 6-column row per OCR'd page and turn the rows into a DataFrame.
# Relies on `listOfPages` (list of .tif paths), `pytesseract`, `Image`, `re`
# and `pd` being defined/imported earlier in the script.

# Compile the patterns once, outside the loop; they do not depend on the page.
# NOTE(review): inside a raw string, '\\n' matches a literal backslash followed
# by "n", not a newline - confirm that this is what the OCR text contains.
duck1 = re.compile(r'(CHE)(.*)\\n', flags=re.DOTALL | re.MULTILINE)
asker1 = re.compile(r'(my|the)\s zzzz(.*)office', flags=re.DOTALL | re.MULTILINE)
date1 = re.compile(r'\s dfg(\d{2}\.\d{2}\.\d{4})', flags=re.DOTALL | re.MULTILINE)
th1 = re.compile(r'(gh|gh)\s fg\s sdf(.*)(rrr,\s rtr)', flags=re.DOTALL | re.MULTILINE)
frage1 = re.compile(r'(\\neee)(.*)(we|wzz)\s drte\s Srrr:', flags=re.DOTALL | re.MULTILINE)


def _first_group(pattern, text, group=1):
    """Return `group` of the first match of `pattern` in `text`, or None."""
    match = pattern.search(text)
    return match.group(group) if match else None


# Start from an empty list so every element of `data` is a 6-item row.
data = []
for entry in listOfPages:
    # Close the image file as soon as OCR is done.
    # Tesseract language codes are three letters ("eng"), not "en".
    with Image.open(entry) as page:
        text = pytesseract.image_to_string(page, lang="eng")

    # Do NOT append `text` on its own here: it is already the first item of
    # the row appended below. Mixing bare strings with rows is what caused the
    # "Shape of passed values" ValueError.
    dru = _first_group(duck1, text)
    asker = _first_group(asker1, text)
    # group 0 keeps the whole match, as in the original code.
    datr = _first_group(date1, text, group=0)
    # The original searched an undefined `thema1`; the pattern is named `th1`.
    thema = _first_group(th1, text)
    frage = _first_group(frage1, text)
    data.append([text, dru, asker, datr, thema, frage])

df0 = pd.DataFrame(
    data,
    columns=['raw_text', 'wer', 'asker', 'date', 'area', 'que_text'],
)
print(df0)
這樣應該就能正常運作了。
轉載請註明出處,本文鏈接:https://www.uj5u.com/ruanti/322288.html
