ぼざろクリーチャーシリーズ DB 兼 API(自分用)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

394 lines
13 KiB

  1. # pylint: disable = missing-class-docstring
  2. # pylint: disable = missing-function-docstring
  3. """
  4. 日次で実行し,ぼざクリ DB を最新に更新する.
  5. """
  6. from __future__ import annotations
  7. import json
  8. import logging
  9. import random
  10. import string
  11. import time
  12. import unicodedata
  13. from datetime import date, datetime, timedelta
  14. from typing import Any, TypedDict, cast
  15. import jaconv
  16. import requests
  17. from db.config import DB
  18. from db.models import (Comment,
  19. Tag,
  20. TrackedVideo,
  21. User,
  22. Video,
  23. VideoHistory,
  24. VideoTag)
# Module-level logger for this script.
logger = logging.getLogger (__name__)
# Emit timestamped INFO-level logs; the script is meant to run unattended (daily).
logging.basicConfig (
    level = logging.INFO,
    format = '%(asctime)s %(levelname)s %(message)s')
class SearchNicoResult (TypedDict):
    """Aggregated result of `search_nico_by_tags`."""
    # All videos found, deduplicated by video code.
    videos: list['VideoResult']
    # True only when every fetch in the run succeeded; used to gate soft-deletion.
    is_complete: bool
class UpdateContext (TypedDict):
    """Input bundle passed to `update_tables`."""
    # Latest search/watch API results for all videos of interest.
    api_data: list['VideoResult']
    # Comments keyed by video code ('contentId'); failed fetches map to [].
    comments_by_video_code: dict[str, list['CommentResult']]
    # When False, soft-deletion of vanished videos is skipped for this run.
    deletable: bool
class VideoSearchParam (TypedDict):
    """Query parameters for the niconico snapshot search API v2."""
    # Search query: tag names joined with ' OR '.
    q: str
    # Search target; 'tagsExact' restricts to exact tag matches.
    targets: str
    # Sort order, e.g. '-viewCounter' (descending view count).
    _sort: str
    # Comma-separated list of response fields to return.
    fields: str
    # Maximum number of hits per request.
    _limit: int
    # JSON-encoded filter object (here: an upload-date range).
    jsonFilter: str
class VideoResult (TypedDict):
    """One video record, shaped like a snapshot search API hit."""
    # Video code (used as Video.code in the DB).
    contentId: str
    # Numeric uploader id; None when the API reports no user
    # (presumably channel/official uploads — unverified).
    userId: int | None
    title: str
    # Tag names as a single space-separated string (split with str.split()).
    tags: str
    description: str | None
    # Cumulative view count at fetch time.
    viewCounter: int
    # Upload datetime as an ISO-8601 string (parsed with datetime.fromisoformat).
    startTime: str
class CommentResult (TypedDict):
    """One comment as returned by the nvComment threads API."""
    # Comment number within the video; used for dedup and sort order.
    no: int
    # Commenter's code (stored as User.code).
    userId: str
    # Comment text.
    body: str
    # Post datetime as an ISO-8601 string (parsed with datetime.fromisoformat).
    postedAt: str
    # "Nicoru" (like) count; may be absent in some payloads (defaulted to 0).
    nicoruCount: int
    # Presumably the playback position of the comment in milliseconds — TODO confirm.
    vposMs: int
  58. def main (
  59. ) -> None:
  60. now = datetime.now ()
  61. today = now.date ()
  62. search_result = search_nico_by_tags (['伊地知ニジカ',
  63. 'ぼざろクリーチャーシリーズ',
  64. 'ぼざろクリーチャーシリーズ外伝'])
  65. comments_by_video_code = fetch_comments_by_video_code (search_result['videos'])
  66. context: UpdateContext = { 'api_data': search_result['videos'],
  67. 'comments_by_video_code': comments_by_video_code,
  68. 'deletable': search_result['is_complete'] }
  69. connection = DB.connection ()
  70. connection.begin_transaction ()
  71. try:
  72. update_tables (context, now, today)
  73. connection.commit ()
  74. except Exception:
  75. connection.rollback ()
  76. raise
def update_tables (
    context: UpdateContext,
    now: datetime,
    today: date,
) -> None:
    """Upsert users, videos, daily view histories, tags and comments from the
    fetched API data, then soft-delete videos that vanished from the results.

    The caller wraps this in a DB transaction; nothing here commits.
    """
    alive_video_codes: list[str] = []
    for datum in context['api_data']:
        # Tag names arrive as one space-separated string.
        tag_names = datum['tags'].split ()
        normalised_tag_names = {normalise (tag_name) for tag_name in tag_names}
        # Resolve (or create) the uploader; userId may be absent.
        user: User | None = None
        if datum['userId'] is not None:
            user = User.where ('code', str (datum['userId'])).first ()
            if user is None:
                user = User ()
                user.code = str (datum['userId'])
                user.save ()
        # Upsert the video row; a re-appearing video gets deleted_at cleared.
        video = Video ()
        video.code = datum['contentId']
        video.user_id = user.id if user else None
        video.title = datum['title']
        video.description = datum['description'] or ''
        video.uploaded_at = datetime.fromisoformat (datum['startTime'])
        video.deleted_at = None
        video.upsert ()
        alive_video_codes.append (video.code)
        # One view-count snapshot per day, keyed by fetched_at.
        video_history = VideoHistory ()
        video_history.video_id = video.id
        video_history.fetched_at = today
        video_history.views_count = datum['viewCounter']
        video_history.upsert ()
        # Close out tags currently attached but no longer present on the video
        # (comparison done on normalised names).
        video_tags = [video_tag for video_tag in video.video_tags
                      if video_tag.untagged_at is None]
        for video_tag in video_tags:
            tag = video_tag.tag
            if tag is None:
                continue
            if normalise (tag.name) in normalised_tag_names:
                continue
            video_tag.untagged_at = today
            video_tag.save ()
        # (Re-)attach every tag reported by the API.
        for tag_name in tag_names:
            tag = Tag.where ('name', tag_name).first ()
            if tag is None:
                tag = Tag ()
                tag.name = tag_name
                tag.save ()
            video_tag = (VideoTag.where ('video_id', video.id)
                         .where ('tag_id', tag.id)
                         .first ())
            if video_tag is None:
                video_tag = VideoTag ()
                video_tag.video_id = video.id
                video_tag.tag_id = tag.id
            # Keep the original tagged_at when re-attaching; otherwise stamp today.
            video_tag.tagged_at = getattr (video_tag, 'tagged_at', None) or today
            video_tag.untagged_at = None
            video_tag.upsert ()
        # Upsert the fetched comments (and their authors) for this video.
        for com in context['comments_by_video_code'].get (video.code, []):
            user = User.where ('code', com['userId']).first ()
            if user is None:
                user = User ()
                user.code = com['userId']
                user.save ()
            comment = Comment ()
            comment.video_id = video.id
            comment.comment_no = com['no']
            comment.user_id = user.id
            comment.content = com['body']
            comment.posted_at = datetime.fromisoformat (com['postedAt'])
            comment.nico_count = com.get ('nicoruCount', 0)
            comment.vpos_ms = com.get ('vposMs', 0)
            comment.upsert ()
    # Soft-delete only when this run saw the complete picture; a partial fetch
    # would otherwise wrongly mark live videos as deleted.
    if not context['deletable']:
        logger.warning ('skip soft-delete because the latest fetch was incomplete')
        return
    if not alive_video_codes:
        logger.warning ('skip soft-delete because no alive videos were fetched')
        return
    videos = (Video.where_not_in ('code', alive_video_codes)
              .where_null ('deleted_at')
              .get ())
    for video in videos:
        video.deleted_at = now
        video.save ()
  160. def fetch_video_data (
  161. video_code: str,
  162. ) -> dict[str, Any]:
  163. time.sleep (1.2)
  164. headers = { 'X-Frontend-Id': '6',
  165. 'X-Frontend-Version': '0' }
  166. action_track_id = (
  167. ''.join (random.choice (string.ascii_letters + string.digits)
  168. for _ in range (10))
  169. + '_'
  170. + str (random.randrange (10 ** 12, 10 ** 13)))
  171. url = (f'https://www.nicovideo.jp/api/watch/v3_guest/{ video_code }'
  172. + f'?actionTrackId={ action_track_id }')
  173. return requests.post (url, headers = headers, timeout = 60).json ()
  174. def fetch_comments_by_video_code (
  175. videos: list[VideoResult],
  176. ) -> dict[str, list[CommentResult]]:
  177. comments_by_video_code: dict[str, list[CommentResult]] = {}
  178. for video in videos:
  179. video_code = video['contentId']
  180. try:
  181. comments_by_video_code[video_code] = fetch_comments (video_code)
  182. except (KeyError,
  183. TypeError,
  184. ValueError,
  185. requests.RequestException) as exc:
  186. logger.warning ('failed to fetch comments: %s (%s)', video_code, exc)
  187. comments_by_video_code[video_code] = []
  188. return comments_by_video_code
  189. def fetch_comments (
  190. video_code: str,
  191. ) -> list[CommentResult]:
  192. video_data = fetch_video_data (video_code)
  193. nv_comment = (video_data.get ('data', {})
  194. .get ('comment', {})
  195. .get ('nvComment'))
  196. if nv_comment is None:
  197. return []
  198. headers = { 'X-Frontend-Id': '6',
  199. 'X-Frontend-Version': '0',
  200. 'Content-Type': 'application/json' }
  201. params = { 'params': nv_comment['params'],
  202. 'additionals': {},
  203. 'threadKey': nv_comment['threadKey'] }
  204. url = nv_comment['server'] + '/v1/threads'
  205. response = requests.post (url,
  206. json = params,
  207. headers = headers,
  208. timeout = 60)
  209. response.raise_for_status ()
  210. res = response.json ()
  211. return select_comments_from_threads (res)
  212. def select_comments_from_threads (
  213. response: dict[str, Any],
  214. ) -> list[CommentResult]:
  215. threads = response.get ('data', {}).get ('threads', [])
  216. if not isinstance (threads, list):
  217. return []
  218. main_comments: list[CommentResult] = []
  219. fallback_comments: list[CommentResult] = []
  220. for thread in threads:
  221. comments = thread.get ('comments') if isinstance (thread, dict) else None
  222. if not isinstance (comments, list):
  223. continue
  224. casted_comments = cast (list[CommentResult], comments)
  225. if len (casted_comments) > len (fallback_comments):
  226. fallback_comments = casted_comments
  227. fork = str (thread.get ('fork', '')).lower ()
  228. label = str (thread.get ('label', '')).lower ()
  229. thread_id = str (thread.get ('id', '')).lower ()
  230. if fork == 'main' or 'main' in label or 'main' in thread_id:
  231. main_comments = casted_comments
  232. selected_comments = main_comments or fallback_comments
  233. deduped_comments: dict[int, CommentResult] = {}
  234. for comment in selected_comments:
  235. comment_no = comment.get ('no')
  236. if not isinstance (comment_no, int):
  237. continue
  238. deduped_comments[comment_no] = comment
  239. return [deduped_comments[comment_no]
  240. for comment_no in sorted (deduped_comments)]
def search_nico_by_tags (
    tags: list[str],
) -> SearchNicoResult:
    """Collect every video carrying any of `tags` (exact tag match) via the
    niconico snapshot search API, sweeping 15-day upload-date windows from
    2022-12-03 up to now, then merge in the manually tracked videos.

    Returns the deduplicated videos plus whether every fetch succeeded.
    """
    today = datetime.now ()
    url = ('https://snapshot.search.nicovideo.jp'
           + '/api/v2/snapshot/video/contents/search')
    # Deduplicates across windows and the tracked list, keyed by video code.
    result_by_video_code: dict[str, VideoResult] = {}
    is_complete = True
    # Earliest upload date of interest; the window sweep starts here.
    to = datetime (2022, 12, 3)
    while to <= today:
        # Throttle between requests.
        time.sleep (1.2)
        until = to + timedelta (days = 14)
        # pylint: disable = consider-using-f-string
        # Restrict hits to uploads within [to 00:00:00, until 23:59:59] JST.
        query_filter = json.dumps ({ 'type': 'or',
                                     'filters': [
                                         { 'type': 'range',
                                           'field': 'startTime',
                                           'from': ('%04d-%02d-%02dT00:00:00+09:00'
                                                    % (to.year, to.month, to.day)),
                                           'to': ('%04d-%02d-%02dT23:59:59+09:00'
                                                  % (until.year, until.month, until.day)),
                                           'include_lower': True }] })
        # NOTE(review): _limit is 100 and no paging is done; a window with more
        # than 100 uploads would be truncated (top view counts kept) — confirm
        # this is acceptable for the tracked series.
        params: VideoSearchParam = { 'q': ' OR '.join (tags),
                                     'targets': 'tagsExact',
                                     '_sort': '-viewCounter',
                                     'fields': ('contentId,'
                                                'userId,'
                                                'title,'
                                                'tags,'
                                                'description,'
                                                'viewCounter,'
                                                'startTime'),
                                     '_limit': 100,
                                     'jsonFilter': query_filter }
        try:
            response = requests.get (
                url,
                params = cast (dict[str, int | str], params),
                timeout = 60)
            response.raise_for_status ()
            res = response.json ()
            for datum in cast (list[VideoResult], res.get ('data', [])):
                result_by_video_code[datum['contentId']] = datum
        except (ValueError, requests.RequestException) as exc:
            # Keep sweeping, but mark the run incomplete so the caller skips
            # soft-deletion.
            logger.warning ('snapshot fetch failed: %s - %s (%s)',
                            to.date (),
                            until.date (),
                            exc)
            is_complete = False
        # The next window starts the day after the current one ends.
        to = until + timedelta (days = 1)
    # Manually tracked videos not found by tag search are fetched individually
    # from the watch API and reshaped into the same VideoResult structure.
    for video in TrackedVideo.get ():
        if video.code in result_by_video_code:
            continue
        try:
            tracked_video = video
            video_data = fetch_video_data (tracked_video.code)['data']
            owner = video_data.get ('owner') or {}
            video_info = video_data['video']
            result_by_video_code[tracked_video.code] = {
                'contentId': tracked_video.code,
                'userId': owner.get ('id'),
                'title': video_info['title'],
                'tags': ' '.join (map (lambda t: t['name'],
                                       video_data['tag']['items'])),
                'description': video_info['description'],
                'viewCounter': video_info['count']['view'],
                'startTime': video_info['registeredAt'] }
        except (KeyError,
                TypeError,
                ValueError,
                requests.RequestException) as exc:
            logger.warning ('tracked video fetch failed: %s (%s)', video.code, exc)
            is_complete = False
    return { 'videos': list (result_by_video_code.values ()),
             'is_complete': is_complete }
  316. def normalise (
  317. text: str,
  318. ) -> str:
  319. return jaconv.hira2kata (
  320. unicodedata.normalize ('NFKC', text.strip ())).lower ()
# Script entry point (intended to be run daily, e.g. from cron).
if __name__ == '__main__':
    main ()