我有一個相當長的 Python 流程,用來重新處理某個表中的大量資料:提取必要的部分、分配適當的值,再將結果寫入另一個表。這個程序運行得很好,只是極其耗時。我想知道是否可以改用 executemany 方法來改寫它?以下是整個程序的片段,其中標出了我希望改用 executemany 的地方。
下面的代碼通過一系列 if/then 判斷運行,為檢索到的資訊分配適當的值。
# Rebuild the Lifetime table entries for the current crawl.
#
# For every URL_Crawl row of the current crawl, work out the newest of its three
# dates and then either open a new Lifetime row (URL not seen in earlier crawls,
# "Situation A") or close/extend the most recent Lifetime row of that URL
# ("Situation B").
#
# Expects `cur` (cursor), `db` (connection), `pd` (pandas) and `current_crawl_ID`
# to be defined by the surrounding program.
# NOTE(review): assumes `db` is the sqlite3 connection `cur` was created from, so
# that "?" placeholders are valid for pandas as well -- confirm.

# Status labels written to Starting_Date_Status / Ending_Date_Status.
# NOTE(review): the "exact" label was spelled in two different ways in this script
# ("精確" and "精確的"); it is unified here -- confirm against the values already
# stored in the Lifetime table.
EXACT = "精確"
ESTIMATED = "估計"

# Fixed statements, bound with parameters at each call site.
UPDATE_LIFETIME_SQL = "UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=? WHERE Lifetime_ID=?"
INSERT_LIFETIME_SQL = "INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)"


def _date_or_zero(value):
    """Return *value*, or the sentinel '0' when it is None or an empty string."""
    return '0' if value is None or value == '' else value


# sqlite3 binds "?" placeholders from a sequence, so wrap a bare ID in a 1-tuple
# (an ID that already arrives as a tuple/list is used unchanged).
if isinstance(current_crawl_ID, (tuple, list)):
    crawl_params = tuple(current_crawl_ID)
else:
    crawl_params = (current_crawl_ID,)

# Begin and end date of the current crawl, fetched in a single query.
cur.execute('SELECT Crawl_Begin_Date, Crawl_End_Date FROM Crawl WHERE Crawl_ID=?', crawl_params)
current_crawl_begin_date, current_crawl_end_date = cur.fetchone()

# URL_Crawl rows that belong to the current crawl.
sql = ('SELECT URL_Crawl_ID, Last_Updated, Last_Published, Date_of_HTML, Unique_URL_ID '
       'FROM URL_Crawl WHERE Crawl_ID=?')
current_crawl = pd.read_sql_query(sql, con=db, params=crawl_params)

# URL_Crawl rows of every crawl before (not including) the current one.
# The query does not depend on the row being processed and no statement in the
# loop below writes to URL_Crawl, so it is run once instead of once per row.
sql2 = ('SELECT URL_Crawl_ID, Last_Updated, Last_Published, Date_of_HTML, Unique_URL_ID '
        'FROM URL_Crawl WHERE Crawl_ID<?')
previous_crawls = pd.read_sql_query(sql2, con=db, params=crawl_params)
previous_unique_url_ids = set(previous_crawls['Unique_URL_ID'].tolist())

sql3 = 'SELECT * FROM Lifetime WHERE URL_Crawl_ID=?'

# num tracks how many rows of current_crawl have been read (nodes in this crawl).
num = 1

# For every unique URL in the current crawl
for row in current_crawl.itertuples():
    # Newest of the three dates; missing dates count as '0', which is also the
    # "no usable date" sentinel tested further down.
    last_updated = _date_or_zero(row.Last_Updated)
    last_published = _date_or_zero(row.Last_Published)
    date_of_html = _date_or_zero(row.Date_of_HTML)
    max_date = max(last_updated, last_published, date_of_html)

    # Remaining values taken from the current_crawl row.
    url_crawl_id = row.URL_Crawl_ID
    unique_url_id = row.Unique_URL_ID

    # Start and end dates/statuses default to None (stored as NULL).
    starting_date = None
    starting_date_status = None
    ending_date = None
    ending_date_status = None

    if unique_url_id in previous_unique_url_ids:
        # Situation B: the URL already exists in an earlier crawl.
        # Collect the Lifetime rows of all its earlier URL_Crawl entries and pick
        # the one with the highest Lifetime_ID as the most recent lifetime.
        existing = previous_crawls[previous_crawls['Unique_URL_ID'] == unique_url_id]
        existing_url_crawl_ids = existing.URL_Crawl_ID.tolist()
        # Lifetime is modified inside this loop, so it is re-read for every row.
        lifetime_frames = [
            pd.read_sql_query(sql3, con=db, params=(i,))
            for i in existing_url_crawl_ids
        ]
        existing_in_lifetime = pd.concat(lifetime_frames, ignore_index=True)
        most_recent_lifetime = existing_in_lifetime[
            existing_in_lifetime.Lifetime_ID == existing_in_lifetime.Lifetime_ID.max()
        ]

        # .tolist() converts to plain Python values before they are compared and
        # bound as SQL parameters.
        most_recent_starting_date = most_recent_lifetime.Starting_Date.tolist()[0]
        most_recent_starting_date_status = most_recent_lifetime.Starting_Date_Status.tolist()[0]
        most_recent_ending_date = most_recent_lifetime.Ending_Date.tolist()[0]
        most_recent_ending_date_status = most_recent_lifetime.Ending_Date_Status.tolist()[0]
        most_recent_lifetimeID = most_recent_lifetime.Lifetime_ID.tolist()[0]

        if max_date != '0':
            if current_crawl_begin_date <= max_date <= current_crawl_end_date:
                # Situation B.2: changed during the current crawl. Close the
                # previous lifetime at max_date and open a new one there.
                ending_date = max_date
                ending_date_status = EXACT
                cur.execute(UPDATE_LIFETIME_SQL,
                            (ending_date, ending_date_status, most_recent_lifetimeID))
                starting_date = max_date
                starting_date_status = EXACT
                # The new lifetime is open-ended: clear the end status as well so
                # it does not inherit the status just written to the previous
                # lifetime (same shape as Situation A.2).
                # NOTE(review): confirm NULL is the intended end status here.
                ending_date = None
                ending_date_status = None
                cur.execute(INSERT_LIFETIME_SQL,
                            (starting_date, ending_date, starting_date_status,
                             ending_date_status, url_crawl_id))
            elif (max_date < current_crawl_begin_date) and (max_date > most_recent_starting_date):
                # Situation B.3: changed between the most recent lifetime's start
                # and this crawl. Close the previous lifetime at max_date and add
                # one that ends (estimated) at the crawl's begin date.
                ending_date = max_date
                ending_date_status = EXACT
                cur.execute(UPDATE_LIFETIME_SQL,
                            (ending_date, ending_date_status, most_recent_lifetimeID))
                starting_date = max_date
                starting_date_status = EXACT
                ending_date = current_crawl_begin_date
                ending_date_status = ESTIMATED
                cur.execute(INSERT_LIFETIME_SQL,
                            (starting_date, ending_date, starting_date_status,
                             ending_date_status, url_crawl_id))
            elif max_date == most_recent_starting_date:
                # Situation B.4: unchanged since the most recent lifetime began;
                # move its (estimated) end to the crawl's begin date.
                ending_date = current_crawl_begin_date
                ending_date_status = ESTIMATED
                cur.execute(UPDATE_LIFETIME_SQL,
                            (ending_date, ending_date_status, most_recent_lifetimeID))
            elif (max_date > current_crawl_end_date) or (max_date < most_recent_starting_date):
                # Situation B.1: date outside the usable range -> treat as missing.
                max_date = '0'

        if max_date == '0':
            # Situation B.5: no usable date; estimate the end of the most recent
            # lifetime as the crawl's begin date.
            ending_date = current_crawl_begin_date
            ending_date_status = ESTIMATED
            cur.execute(UPDATE_LIFETIME_SQL,
                        (ending_date, ending_date_status, most_recent_lifetimeID))
    else:
        # Situation A: new node, not seen in any previous crawl.
        if max_date != '0':
            if current_crawl_begin_date <= max_date <= current_crawl_end_date:
                # Situation A.2: dated within the current crawl.
                starting_date = max_date
                starting_date_status = EXACT
            elif max_date < current_crawl_begin_date:
                # Situation A.3: dated before the current crawl began.
                starting_date = max_date
                starting_date_status = EXACT
                ending_date = current_crawl_begin_date
                ending_date_status = ESTIMATED
            elif max_date > current_crawl_end_date:
                # Situation A.1: dated after the crawl ended -> treat as missing.
                max_date = '0'

        if max_date == '0':
            # Situation A.4: no usable date; estimate the start from the crawl's
            # end date.
            starting_date = current_crawl_end_date
            starting_date_status = ESTIMATED

        cur.execute(INSERT_LIFETIME_SQL,
                    (starting_date, ending_date, starting_date_status,
                     ending_date_status, url_crawl_id))
executemany 能否用在這種場合?如果可以,我不清楚 executemany 的正確語法——我已經嘗試過幾種寫法,但都沒有成功。資料庫是 SQLite,程式以 Python 撰寫。
uj5u.com熱心網友回復:
在沒有完全理解你的代碼之前,很難給出準確的答案,我也不太清楚你是在哪里迭代 urls/ids 等等。基本做法是:在迴圈外為 UPDATE 和 INSERT 各建立一個串列,在迴圈內把每組引數序列累積到對應的串列中;迴圈結束后,再把每個串列連同要執行的固定 SQL 一起傳給 executemany。
下面的示例應該能讓你了解它如何配合迴圈/迭代運作。
# ...
# Sketch: collect the parameters for every UPDATE/INSERT while looping and run
# each statement once with executemany() after the loop.
#
# NOTE: the batched statements only run after the loop has finished, so a SELECT
# on Lifetime issued inside the loop will not see them.

# Each list holds one parameter tuple per statement to execute, i.e.
# [(param0, ..., paramN), ..., (param0, ..., paramN)]
params_to_update = []
params_to_insert = []

# For every unique URL in the current crawl
for row in current_crawl.itertuples():
    # ...
    if max_date != '0':
        if (max_date >= current_crawl_begin_date) and (max_date <= current_crawl_end_date):
            # Situation B.2
            ending_date = max_date
            ending_date_status = "精確"
            params_to_update.append((ending_date, ending_date_status, most_recent_lifetimeID))
            starting_date = max_date
            starting_date_status = "精確"
            ending_date = None
            ending_date_status = None
            params_to_insert.append((starting_date, ending_date, starting_date_status,
                                     ending_date_status, url_crawl_id))
        elif (max_date < current_crawl_begin_date) and (max_date > most_recent_starting_date):
            # Situation B.3
            ending_date = max_date
            ending_date_status = "精確"
            params_to_update.append((ending_date, ending_date_status, most_recent_lifetimeID))
            starting_date = max_date
            starting_date_status = "精確"
            ending_date = current_crawl_begin_date
            ending_date_status = "估計"
            params_to_insert.append((starting_date, ending_date, starting_date_status,
                                     ending_date_status, url_crawl_id))
    # ... every other branch that runs the same UPDATE or INSERT appends its
    # parameter tuple to the matching list in the same way.

# After the for loop has finished:
# run the UPDATE once for every parameter tuple in the list.
UPDATE_SQL = "UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=? WHERE Lifetime_ID=?"
cur.executemany(UPDATE_SQL, params_to_update)

# Run the INSERT once for every parameter tuple in the list. Each tuple carries
# five values, so the statement needs five placeholders after the null key.
INSERT_SQL = "INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)"
cur.executemany(INSERT_SQL, params_to_insert)
stackoverflow.com: using-executemany-to-update-entries-in-existing-sqlite3-database-using-pyt
docs.python.org: python doc executemany 示例
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/309815.html
標籤:
