|
module Similarity
  # Rebuilds the "<Model>Similarity" table for a model.
  #
  # Every record is compared with every other record by the ids of the
  # associated records reached through +tgt+. The score is the cosine
  # similarity of the two id lists treated as binary vectors:
  #
  #   |A ∩ B| / sqrt(|A| * |B|)
  #
  # For each record only the +limit+ best-scoring peers are kept. The old
  # contents of the similarity table are replaced by the new rows.
  class Calc
    # Default maximum number of similar records stored per record.
    DEFAULT_LIMIT = 20

    # Number of rows passed to a single insert_all! call.
    INSERT_BATCH_SIZE = 1_000

    class << self
      # Recomputes and stores all similarities for +model+.
      #
      # @param model [Class] model class; must respond to +name+ and
      #   +includes(tgt).select(:id)+. A class named "#{model.name}Similarity"
      #   must exist and respond to +transaction+, +delete_all+ and
      #   +insert_all!+.
      # @param tgt [Symbol] association whose record ids are compared.
      # @param limit [Integer] maximum number of peers stored per record.
      # @return [nil]
      # @raise [ArgumentError] if +limit+ is not a positive Integer.
      def call(model, tgt, limit: DEFAULT_LIMIT)
        unless limit.is_a?(Integer) && limit.positive?
          raise ArgumentError, "limit must be a positive Integer, got #{limit.inspect}"
        end

        similarity_model = "#{model.name}Similarity".constantize

        records = model.includes(tgt).select(:id).to_a
        ids = records.map(&:id)
        # Sorted so that intersection_size can walk both lists in one pass.
        sorted_tags = records.map { |record| record.public_send(tgt).map(&:id).sort }

        top = top_similarities(ids, sorted_tags, limit)

        # Column names are derived once; they do not change per row.
        base = model.name.underscore
        source_key = :"#{base}_id"
        target_key = :"target_#{base}_id"

        # The old rows are removed only after the new ones are fully computed,
        # and delete + insert share one transaction, so a failure cannot leave
        # the table empty or partially filled.
        similarity_model.transaction do
          similarity_model.delete_all

          batch = []
          top.each do |record_id, list|
            list.each do |cos, target_id|
              batch << { source_key => record_id, target_key => target_id, cos: cos }
              next if batch.size < INSERT_BATCH_SIZE

              similarity_model.insert_all!(batch)
              batch = []
            end
          end
          similarity_model.insert_all!(batch) unless batch.empty?
        end

        nil
      end

      private

      # Scores every unordered pair once and feeds the score into both
      # records' top lists.
      #
      # @return [Hash{Object => Array<Array(Float, Object)>}] per record id,
      #   [cos, target_id] pairs ordered from highest to lowest cos.
      def top_similarities(ids, sorted_tags, limit)
        top = Hash.new { |hash, key| hash[key] = [] }

        ids.each_with_index do |id, i|
          a = sorted_tags[i]

          ((i + 1)...ids.size).each do |j|
            other_id = ids[j]
            b = sorted_tags[j]

            norm = Math.sqrt(a.size * b.size)
            # A record without any associated ids has similarity 0 to everything.
            cos = norm.zero? ? 0.0 : intersection_size(a, b).fdiv(norm)

            push_topk(top[id], cos, other_id, limit)
            push_topk(top[other_id], cos, id, limit)
          end
        end

        top
      end

      # Counts the elements common to two ascending-sorted arrays with a
      # single merge-style pass (no intermediate arrays are allocated).
      def intersection_size(a, b)
        i = 0
        j = 0
        count = 0

        while i < a.size && j < b.size
          left = a[i]
          right = b[j]

          if left == right
            count += 1
            i += 1
            j += 1
          elsif left < right
            i += 1
          else
            j += 1
          end
        end

        count
      end

      # Inserts [cos, target_id] into +list+, which is kept sorted by cos in
      # descending order and capped at +limit+ entries.
      def push_topk(list, cos, target_id, limit)
        # Full list and not better than the current worst entry: nothing to do.
        return if list.size >= limit && cos <= list.last.first

        # If the target is already listed, keep only its better score.
        existing = list.index { |(_, id)| id == target_id }
        if existing
          return if cos <= list[existing].first

          list.delete_at(existing)
        end

        # First position with a strictly lower score; ties stay behind
        # entries that were inserted earlier.
        insert_at = list.index { |(score, _)| cos > score } || list.size
        list.insert(insert_at, [cos, target_id])
        list.pop if list.size > limit
      end
    end
  end
end
|