在不使用Julia中的for回圈的情況下迭代字串向量的向量-有解無憂

給定字串向量的向量，例如：

# Sample corpus: one inner vector of string tokens per sentence.
sentences = [
    ["Julia", "is", "1000x", "faster", "than", "Python!"],
    ["Julia", "reads", "beautiful!"],
    ["Python", "has", "600", "times", "more", "libraries"],
]

我試圖過濾掉每個句子中的一些標記，同時保留外層的向量結構（即，不把向量扁平化為單個標記串列）。

到目前為止，我已經使用經典的 for 回圈實作了這一點：

# Per-sentence counters, one entry per inner vector of `sentences`.
# They are deliberately left as untyped `[]` (i.e. Vector{Any}) so that the
# zipped result is the `Vector{Tuple{Any, Any, Any}}` this baseline produces;
# use `Int[]` instead when type-stable code is wanted.
number_of_alphabetical_tokens = []
number_of_long_words = []
total_tokens = []

for sent in sentences
    # `push!` appends exactly one count per sentence; `count` avoids building a
    # temporary filtered array only to take its length.
    push!(number_of_alphabetical_tokens, count(token -> all(isletter, token), sent))
    push!(number_of_long_words, count(token -> length(token) > 2, sent))
    push!(total_tokens, length(sent))
end

# One (alphabetical, long, total) tuple per sentence.
collect(zip(number_of_alphabetical_tokens, number_of_long_words, total_tokens))

輸出：（根據@shayan 觀察編輯）

3-element Vector{Tuple{Any, Any, Any}}:
 (4, 5, 6)
 (2, 3, 3)
 (5, 6, 6)

這完成了作業，但它花費的時間比我想要的要多（我有 6000 多個檔案，每個檔案有數千個句子......），它看起來有點像反模式。

有沒有辦法通過理解或廣播（或任何更高效的方法）來做到這一點？

uj5u.com熱心網友回復：

首先，我猜您在寫最終結果時有個錯誤；例如，您把 sentences 第一個元素的總標記數寫成了 7，而實際上應該是 6。
您可以按照這樣的程序，完全矢量化：

julia> sentences = [ ["Julia", "is", "1000x", "faster", "than", "Python!"],
                     ["Julia", "reads", "beautiful!"],
                     ["Python", "has", "600", "times", "more", "libraries"]
                   ];

julia> function check_all_letter(str::AbstractString)
           # True when every character of `str` is a letter (vacuously true for "").
           # Accepting `AbstractString` also admits `SubString` tokens, not only `String`.
           return all(isletter, str)
       end
check_all_letter (generic function with 1 method)

# Keep only the all-letter tokens of each sentence; the outer vector structure is preserved.
julia> all_letters = map(x->filter(y->check_all_letter.(y), x), sentences)
3-element Vector{Vector{String}}:
 ["Julia", "is", "faster", "than"]
 ["Julia", "reads"]
 ["Python", "has", "times", "more", "libraries"]

# Number of surviving tokens per sentence.
julia> length.(all_letters)
3-element Vector{Int64}:
 4
 2
 5

對於 number_of_long_words 和 total_tokens，我可以採用類似的做法。將所有這些包裝在一個函式中，就得到：

julia> function arbitrary_name(vec::Vector{Vector{String}})
           # Returns one (all-letter count, long-token count, total count) tuple per
           # inner vector of `vec`. Every step reads the `vec` argument, so the result
           # does not depend on any global variable.
           all_letters = map(x->filter(y->check_all_letter.(y), x), vec)
           # "Long" means strictly more than 2 characters.
           long_words = map(x->filter(y->length.(y).>2, x), vec)
           total_tokens = length.(vec)

           return collect(zip( length.(all_letters),
                               length.(long_words),
                               total_tokens
                             )
                   )
       end
arbitrary_name (generic function with 1 method)

julia> arbitrary_name(sentences)
3-element Vector{Tuple{Int64, Int64, Int64}}:
 (4, 5, 6)
 (2, 3, 3)
 (5, 6, 6)

附加說明

當我寫類似的東西時length.(y).>2，事實上，我試圖通過矢量化鏈接一些 julia 函式。考慮這個例子來充分理解正在發生的事情length.(y).>2：

julia> vec = ["foo", "bar", "baz"];

julia> lengths = length.(vec)
3-element Vector{Int64}:
 3
 3
 3

julia> more_than_two = lengths .> 2
3-element BitVector:
 1
 1
 1

# This is exactly equal to this:
julia> length.(vec).>2
3-element BitVector:
 1
 1
 1

# Or, with element-wise piping (strictly greater than 2, same test as `.> 2` above)
julia> vec .|> length .|> (x -> x > 2)
3-element BitVector:
 1
 1
 1

我希望這對 @fandak 有所幫助。關於廣播和函式鏈接的進一步說明，請參閱官方文件。

uj5u.com熱心網友回復：

在 Julia 中，出于性能原因，沒有理由避免回圈。回圈很快，矢量化代碼只是變相的回圈。

這是一個使用回圈和一些歸約函式（例如 all 和 count）的示例：

"""
    wordstats(sentences)

Return a `Vector{NTuple{3, Int}}` with one entry per element of `sentences`:
`(tokens made only of letters, tokens longer than 2 characters, total tokens)`.
"""
function wordstats(sentences)
    stats = Vector{NTuple{3, Int}}(undef, length(sentences))
    for (idx, sentence) in pairs(sentences)
        nalpha = count(word -> all(isletter, word), sentence)
        nlong = count(word -> length(word) > 2, sentence)
        stats[idx] = (nalpha, nlong, length(sentence))
    end
    return stats
end

上面的代碼并沒有經過優化，例如計算長度超過 2 的單詞的部分還可以改進，但是在我的筆記本電腦上運行時間大約是 700ns，這比向量化的解決方案要快得多。

編輯：這里的代碼基本相同，但使用了map do語法（所以你不必弄清楚回傳型別）：

"""
    wordstats2(sentences)

Map each element of `sentences` to the tuple
`(tokens made only of letters, tokens longer than 2 characters, total tokens)`.
The container and element type of the result are whatever `map` produces.
"""
function wordstats2(sentences)
    isalphabetic(word) = all(isletter, word)
    islong(word) = length(word) > 2
    return map(sentences) do sentence
        (count(isalphabetic, sentence), count(islong, sentence), length(sentence))
    end
end

轉載請註明出處，本文鏈接：https://www.uj5u.com/houduan/516926.html

標籤：循环向量nlp朱莉娅阵列广播

上一篇：使用for回圈的羅馬數字生成器

下一篇：如何比較串列中的每個元素并檢查它是否大于右側的元素