我正在嘗試讀取以 | 分隔的 4 個 .txt 檔案。
因為其中之一超過 1Gb df_tradeCash_mhi = pd.concat(chunk_read(mhi_tradeCashFiles, "MHI"))
我找到了讀取它們的“塊”方法,但我收到錯誤標記資料。記憶體不足。
有誰知道我該如何解決這個問題?
下面是我的代碼
def findmefile(directory, containsInFilename):
entity_filenames = {}
for file in os.listdir(directory):
if containsInFilename in file:
if file[:5] == "Trade":
entity_filenames["MHI"] = file
else:
entity_filenames[re.findall("(.*?)_", file)[0]] = file
return entity_filenames
# Get the core Murex file names
mhi_tradeFiles = findmefile(CoreMurexFilesLoc, "Trade")
mhi_tradeCashFiles = findmefile(CoreMurexFilesLoc, "TradeCash_")
mheu_tradeFiles = findmefile(CoreMurexFilesLoc, "MHEU")
mheu_tradeCashFiles = findmefile(CoreMurexFilesLoc, "MHEU_TradeCash")
# Read the csv using chunck
mylist = []
size = 10**2
def chunk_read(fileName, entity):
for chunk in pd.read_csv(
CoreMurexFilesLoc "\\" fileName[entity],
delimiter="|",
low_memory=False,
chunksize=size,
):
mylist.append(chunk)
return mylist
df_trade_mhi = pd.concat(chunk_read(mhi_tradeFiles, "MHI"))
df_trade_mheu = pd.concat(chunk_read(mheu_tradeFiles, "MHEU"))
df_tradeCash_mheu = pd.concat(chunk_read(mheu_tradeCashFiles, "MHEU"))
df_tradeCash_mhi = pd.concat(chunk_read(mhi_tradeCashFiles, "MHI"))
df_trades = pd.concat(
[df_trade_mheu, df_trade_mhi, df_tradeCash_mheu, df_tradeCash_mhi]
)
del df_trade_mhi
del df_tradeCash_mhi
del df_trade_mheu
del df_tradeCash_mheu
# Drop any blank fields and duplicates
nan_value = float("NaN")
df_trades.replace("", nan_value, inplace=True)
df_trades.dropna(subset=["MurexCounterpartyRef"], inplace=True)
df_trades.drop_duplicates(subset=["MurexCounterpartyRef"], inplace=True)
counterpartiesList = df_trades["MurexCounterpartyRef"].tolist()
print(colored('All Core Murex trade and tradeCash data loaded.', "green"))
錯誤:
Traceback (most recent call last):
File "h:\DESKTOP\test_check\check_securityPrices.py", line 52, in <module>
df_tradeCash_mhi = pd.concat(chunk_read(mhi_tradeCashFiles, "MHI"))
File "h:\DESKTOP\test_check\check_securityPrices.py", line 39, in chunk_read
for chunk in pd.read_csv(
File "C:\Users\MIRABR\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\parsers\readers.py", line 1024, in __next__
return self.get_chunk()
File "C:\Users\MIRABR\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\parsers\readers.py", line 1074, in get_chunk
return self.read(nrows=size)
File "C:\Users\MIRABR\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\parsers\readers.py", line 1047, in read
index, columns, col_dict = self._engine.read(nrows)
File "C:\Users\MIRABR\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py", line 228, in read
data = self._reader.read(nrows)
File "pandas\_libs\parsers.pyx", line 783, in pandas._libs.parsers.TextReader.read
File "pandas\_libs\parsers.pyx", line 857, in pandas._libs.parsers.TextReader._read_rows
File "pandas\_libs\parsers.pyx", line 843, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas\_libs\parsers.pyx", line 1925, in pandas._libs.parsers.raise_parser_error
pandas.errors.ParserError: Error tokenizing data. C error: out of memory
uj5u.com熱心網友回復:
我認為問題很明顯 - 您的記憶體不足,因為您試圖一次將如此多的資料加載到記憶體中,然后對其進行處理。
您需要:
- 買一臺記憶體更大的機器。
- 使用生成器或協程管道重新構建解決方案以使用流水線方法對資料進行逐步處理。
第一種方法的問題是它不會無限擴展并且很昂貴。第二種方法是正確的方法,但需要更多的編碼。
作為生成器/協程型別管道方法的一個很好的參考,請查看 David Beazley 的任何 pycon 演講。
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/355833.html
下一篇:使用bash-c逐行決議
