#安裝NLTK和相關包
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import webtext
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
#匯入檔案并改為NLTK適用文本,使用NLTK進行句子切分
with open ('D:\Python\my_corpus\TEM4_2005.txt',encoding='utf-8') as f:
text = f.read()
corpus_root = r'D:\Python\my corpus'
sent_tokenizer =PunktSentenceTokenizer(corpus_root,['TEM4.*\.txt'])
sents = sent_tokenizer.tokenize(text)#切分句子
#sents[1]
#print(sents)
#查詢單詞并匹配對應句子
A = input("請輸入單詞:")
B = input("請輸入單詞:")
for lines in sents:
if A in lines:
if B in lines:
print ("包含該單詞的句子有:",lines)
--------------------------------------------------------------
我是NLTK的初學者,我想實作的是對my_corpus里所有的語料文本進行單詞查詢。但是現在sents = sent_tokenizer.tokenize(text)只能讀取單個文本 with open 也只會呼叫單個文本。請問我改怎么改寫代碼
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/184899.html
