我可以用executemany來處理帶有sqlite3的大批量程式嗎？ -有解無憂

我有一個相當長的 Python 程序，用來重新處理一個表中的大量資料：提取必要的部分、分配適當的值，再將結果寫入另一個表。這個程序運行得很好！只是極其耗時。我想知道是否有辦法使用 executemany 方法來改寫這個程序？以下是整個程序的片段，其中標出了我希望改用 executemany 的地方。

下面的代碼通過一系列的if/then陳述句運行，為檢索到的資訊分配適當的值。

 # Current crawl begin date
    # Begin date of the crawl being processed.
    # NOTE(review): sqlite3 takes its parameters as a sequence, so this call
    # presumably relies on current_crawl_ID already being a 1-element sequence
    # (or a 1-character string) -- confirm against the caller.
    cur.execute('SELECT Crawl_Begin_Date FROM Crawl WHERE Crawl_ID=?', current_crawl_ID)
    current_crawl_begin_date = cur.fetchone()[0]

    # End date of the crawl being processed.
    cur.execute('SELECT Crawl_End_Date FROM Crawl WHERE Crawl_ID=?', current_crawl_ID)
    current_crawl_end_date = cur.fetchone()[0]

    # URL_Crawl rows where Crawl_ID == current crawl.
    sql = 'SELECT URL_Crawl_ID, Last_Updated, Last_Published, Date_of_HTML, Unique_URL_ID FROM URL_Crawl WHERE Crawl_ID=%s'
    current_crawl = pd.read_sql_query(sql % (current_crawl_ID), con=db)

    # URL_Crawl rows up until (but not including) the current crawl.
    # The loop below only writes to Lifetime, never to URL_Crawl, so this
    # result is the same for every row: query it once instead of once per row.
    sql2 = 'SELECT URL_Crawl_ID, Last_Updated, Last_Published, Date_of_HTML, Unique_URL_ID FROM URL_Crawl WHERE Crawl_ID<%s'
    previous_crawls = pd.read_sql_query(sql2 % (current_crawl_ID), con=db)
    previous_unique_url_ids = set(previous_crawls['Unique_URL_ID'].tolist())

    # num tracks the number of rows read from current_crawl
    # (the number of nodes in the current crawl).
    num = 1

    # For every unique URL in the current crawl.
    for row in current_crawl.itertuples():

        # Calculate the max date ...............................................
        # Missing or empty dates are replaced by the sentinel '0'.
        # NOTE(review): the dates are compared as plain values; this presumably
        # assumes sortable (ISO-like) date strings -- confirm.
        last_updated = '0' if row.Last_Updated in (None, '') else row.Last_Updated
        last_published = '0' if row.Last_Published in (None, '') else row.Last_Published
        date_of_html = '0' if row.Date_of_HTML in (None, '') else row.Date_of_HTML

        max_date = max(last_updated, last_published, date_of_html)
        # ......................................................................

        # Remaining values taken from the current_crawl dataframe row.
        url_crawl_id = row.URL_Crawl_ID
        unique_url_id = row.Unique_URL_ID

        # Starting/ending dates and statuses default to None (stored as NULL).
        starting_date = None
        starting_date_status = None
        ending_date = None
        ending_date_status = None

        # The row's unique_url_id exists in previous crawls (not a new node).
        if unique_url_id in previous_unique_url_ids:

            # Situation B ......................................................

            # Find the most recent lifetime of the existing node.
            existing = previous_crawls[previous_crawls['Unique_URL_ID'] == unique_url_id]
            existing_url_crawl_ids = (existing.URL_Crawl_ID).tolist()

            # Collect the per-ID results and concatenate once; DataFrame.append
            # was removed in pandas 2.0.
            lifetime_frames = []
            for i in existing_url_crawl_ids:
                sql3 = 'SELECT * FROM Lifetime WHERE URL_Crawl_ID=%d'
                exist_in_lt = pd.read_sql_query(sql3 % (i), con=db)
                lifetime_frames.append(exist_in_lt)
            existing_in_lifetime = pd.concat(lifetime_frames, ignore_index=True)

            # The lifetime with the highest Lifetime_ID is the most recent one.
            most_recent_lifetime = existing_in_lifetime[
                existing_in_lifetime.Lifetime_ID == existing_in_lifetime.Lifetime_ID.max()
            ]

            # Dates/statuses from the most recent lifetime. tolist() yields
            # native Python values, which can be bound as SQL parameters.
            most_recent_starting_date = ((most_recent_lifetime.Starting_Date).tolist())[0]
            most_recent_starting_date_status = ((most_recent_lifetime.Starting_Date_Status).tolist())[0]
            most_recent_ending_date = ((most_recent_lifetime.Ending_Date).tolist())[0]
            most_recent_ending_date_status = ((most_recent_lifetime.Ending_Date_Status).tolist())[0]
            most_recent_lifetimeID = ((most_recent_lifetime.Lifetime_ID).tolist())[0]

            if max_date != '0':
                if current_crawl_begin_date <= max_date <= current_crawl_end_date:
                    # Situation B.2: close the previous lifetime at max_date
                    # and open a new one starting at max_date.
                    ending_date = max_date
                    ending_date_status = "Exact"
                    cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=?
                                WHERE Lifetime_ID=?""", (ending_date, ending_date_status, most_recent_lifetimeID))
                    starting_date = max_date
                    starting_date_status = "Exact"
                    # NOTE(review): only ending_date is reset here, so the new
                    # row is inserted with Ending_Date NULL but
                    # Ending_Date_Status "Exact" -- confirm this is intended.
                    ending_date = None
                    cur.execute("""INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)
                                """, (starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
                elif max_date < current_crawl_begin_date and max_date > most_recent_starting_date:
                    # Situation B.3: close the previous lifetime at max_date
                    # and insert a lifetime ending at the crawl begin date.
                    ending_date = max_date
                    ending_date_status = "Exact"
                    cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=?
                                WHERE Lifetime_ID=?""", (ending_date, ending_date_status, most_recent_lifetimeID))
                    starting_date = max_date
                    starting_date_status = "Exact"
                    ending_date = current_crawl_begin_date
                    ending_date_status = "Estimated"
                    cur.execute("""INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)
                                """, (starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
                elif max_date == most_recent_starting_date:
                    # Situation B.4: only the previous lifetime's end is updated.
                    ending_date = current_crawl_begin_date
                    ending_date_status = "Estimated"
                    cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=?
                                WHERE Lifetime_ID=?""", (ending_date, ending_date_status, most_recent_lifetimeID))
                elif max_date > current_crawl_end_date or max_date < most_recent_starting_date:
                    # Situation B.1: treat the date as missing so that the
                    # B.5 branch below handles it.
                    max_date = '0'
            if max_date == '0':
                # Situation B.5: (max_date == '0')
                ending_date = current_crawl_begin_date
                ending_date_status = "Estimated"
                cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=?
                            WHERE Lifetime_ID=?""", (ending_date, ending_date_status, most_recent_lifetimeID))

        # The row's unique_url_id is a new node (not seen in previous crawls).
        else:

            # Situation A ......................................................

            if max_date != '0':
                if current_crawl_begin_date <= max_date <= current_crawl_end_date:
                    # Situation A.2
                    starting_date = max_date
                    starting_date_status = "Exact"
                elif max_date < current_crawl_begin_date:
                    # Situation A.3
                    starting_date = max_date
                    starting_date_status = "Exact"
                    ending_date = current_crawl_begin_date
                    ending_date_status = "Estimated"
                elif max_date > current_crawl_end_date:
                    # Situation A.1: treat the date as missing so that the
                    # A.4 branch below handles it.
                    max_date = '0'
            if max_date == '0':
                # Situation A.4: (max_date == '0')
                starting_date = current_crawl_end_date
                starting_date_status = "Estimated"

            # Every new node gets exactly one Lifetime row.
            cur.execute("""INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)
                        """, (starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))

executemany能否以這種身份使用？如果可以的話，我不知道executemany的適當語法--我已經嘗試了一些東西，但還沒有成功。資料庫是SQLite，程式是基于python的。

uj5u.com熱心網友回復：

在沒有完全理解你的代碼之前，很難給出一個準確的答案。我不太明白你在哪里迭代urls/ids/等等。你要在回圈外為更新和插入建立一個串列，然后將引數序列累積到它們相應的串列中。最后，在回圈之后，你將把每個串列與你想要執行的固定 SQL 一起傳遞給 executemany。

這應該能讓你了解它是如何配合回圈/迭代運作的：
    #...

    # Sketch: accumulate the parameters inside the loop, then run each fixed
    # SQL statement once over its whole list with executemany.
    # Each of these is a list of tuples/lists,
    # i.e. [(param0, ..., paramN), ..., (param0, ..., paramN)]
    params_to_update = []
    params_to_insert = []

    # For every unique URL in the current crawl.
    for row in current_crawl.itertuples():

        #...

            if (max_date != '0'):
                if ((max_date >= current_crawl_begin_date) & (max_date <= current_crawl_end_date)):
                    # Situation B.2
                    ending_date = max_date
                    ending_date_status = "Exact"
                    params_to_update.append((ending_date, ending_date_status, most_recent_lifetimeID))
                    starting_date = max_date
                    starting_date_status = "Exact"
                    ending_date = None
                    params_to_insert.append((starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
                elif ((max_date < current_crawl_begin_date) & (max_date > most_recent_starting_date)):
                    # Situation B.3
                    ending_date = max_date
                    ending_date_status = "Exact"
                    params_to_update.append((ending_date, ending_date_status, most_recent_lifetimeID))
                    starting_date = max_date
                    starting_date_status = "Exact"
                    ending_date = current_crawl_begin_date
                    ending_date_status = "Estimated"
                    params_to_insert.append((starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))

    # After the for loop has finished:
    # run the UPDATE once for every parameter sequence in this list.
    UPDATE_SQL = """UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=? WHERE Lifetime_ID=?"""
    cur.executemany(UPDATE_SQL, params_to_update)
    # Run the INSERT once for every parameter sequence in this list.
    # Five placeholders, matching the five values in each tuple above.
    INSERT_SQL = """INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)"""
    cur.executemany(INSERT_SQL, params_to_insert)

stackoverflow.com：Using executemany to update entries in an existing SQLite3 database (using Python sqlite3)

docs.python.org: python doc executemany 示例

轉載請註明出處，本文鏈接：https://www.uj5u.com/houduan/309815.html

標籤：

上一篇：SQLKata與SQLite的最小例子（Powershell）。

下一篇：SQL-根據其他兩列的值從一列中獲取最小值