module Similarity
  # Rebuilds the "<Model>Similarity" table for a model.
  #
  # Every record of +model+ is compared with every other record by the cosine
  # similarity of the id lists of its +tgt+ association
  # (|A ∩ B| / sqrt(|A| * |B|)), and the best +limit+ matches per record are
  # stored.
  class Calc
    # Maximum number of similar records stored per record by default.
    DEFAULT_LIMIT = 20

    # Number of rows written per insert_all! call.
    BATCH_SIZE = 1_000

    # Recomputes and replaces all rows of the similarity table.
    #
    # The similarity model is looked up as "#{model.name}Similarity" and is
    # written with the keys :"<model>_id", :"target_<model>_id" and :cos,
    # where <model> is model.name.underscore.
    #
    # @param model [Class] model class whose records are compared
    # @param tgt [Symbol] association whose member ids describe a record
    # @param limit [Integer] maximum number of rows stored per record
    # @return [nil]
    # @raise [ArgumentError] if +limit+ is not a positive Integer
    def self.call(model, tgt, limit: DEFAULT_LIMIT)
      raise ArgumentError, "limit must be a positive Integer" unless limit.is_a?(Integer) && limit.positive?

      similarity_model = "#{model.name}Similarity".constantize

      # record id => sorted ids of the associated records
      tag_ids = model.includes(tgt).select(:id).to_a.to_h do |record|
        [record.id, record.public_send(tgt).map(&:id).sort]
      end

      # record id => [[cos, other_id], ...] ordered by descending cos
      top = Hash.new { |hash, key| hash[key] = [] }
      ids = tag_ids.keys

      ids.each_with_index do |post_id, i|
        a = tag_ids[post_id]

        (i + 1).upto(ids.size - 1) do |j|
          target_id = ids[j]
          b = tag_ids[target_id]

          norm = Math.sqrt(a.size * b.size)
          cos = norm.zero? ? 0.0 : intersection_size(a, b).fdiv(norm)

          # Similarity is symmetric, so one comparison feeds both rankings.
          push_topk(top[post_id], cos, target_id, limit)
          push_topk(top[target_id], cos, post_id, limit)
        end
      end

      base = model.name.underscore
      source_key = :"#{base}_id"
      target_key = :"target_#{base}_id"

      rows = top.flat_map do |post_id, list|
        list.map do |cos, target_post_id|
          { source_key => post_id, target_key => target_post_id, cos: cos }
        end
      end

      # Replace the old rows only after the computation has succeeded, and do
      # it atomically, so readers never observe an empty or partial table.
      similarity_model.transaction do
        similarity_model.delete_all
        # each_slice never yields an empty batch, which insert_all! must not get.
        rows.each_slice(BATCH_SIZE) { |batch| similarity_model.insert_all!(batch) }
      end

      nil
    end

    # Counts the elements two ascending-sorted arrays have in common using a
    # linear merge walk (repeated values are matched pairwise).
    def self.intersection_size(a, b)
      i = 0
      j = 0
      count = 0

      while i < a.size && j < b.size
        case a[i] <=> b[j]
        when 0
          count += 1
          i += 1
          j += 1
        when -1
          i += 1
        else
          j += 1
        end
      end

      count
    end

    # Inserts [cos, target_id] into +list+, keeping it sorted by descending
    # cos and at most +limit+ entries long. Entries with equal cos keep their
    # arrival order. Each pair of records is compared exactly once by the
    # caller, so a target can never already be present in the list.
    def self.push_topk(list, cos, target_id, limit)
      return if list.size >= limit && cos <= list.last.first

      insert_at = list.bsearch_index { |entry| entry.first < cos } || list.size
      list.insert(insert_at, [cos, target_id])
      list.pop if list.size > limit
    end

    private_class_method :intersection_size, :push_topk
  end
end