본문 바로가기

수업정리/Fundamental

[Python] 한글 유니코드 11 -

sqlite3 wiki.sqlite3

-- Page cache: one row per URL with the page body stored as a blob and the
-- time of the fetch stored as text (the crawler writes it as YYYYMMDDHHMMSS).
create table url_body (
    url  text not null,
    body blob,
    date text not null,
    primary key (url)
);

-- Give wiki_links a column that points at the cached page in url_body.
-- NOTE(review): "fk_rul" looks like a typo for "fk_url"; the name is kept
-- unchanged here so existing databases/queries keep working -- confirm before renaming.
alter table wiki_links
    add column fk_rul references url_body (url);

 

scrape_wiki_random_blob.py

 # -*- coding: utf-8 -*-
 from crawl_wiki2 import find_first_link, continue_crawl

 #main
 import time
 import urllib.parse # to decode any URL string
 from datetime import datetime
 from wiki_sqldb import connect_db, insert_data
 import sys
 import sqlite3

 # Crawl Korean Wikipedia by repeatedly following the first link of each
 # article, starting from a random page, and store every fetched page body
 # in the url_body table (url text, body blob, date text).
 conn = connect_db('wiki.sqlite3')

 start_url = "https://ko.wikipedia.org/wiki/Special:Random"
 target_url = "https://ko.wikipedia.org/wiki/한글"
 # Show the configuration and wait for Enter before starting.
 input('start:%s' %start_url)
 input('target:%s' %target_url)

 article_chain = [start_url]

 # Values are always bound through "?" placeholders: a bytes value cannot be
 # interpolated into the SQL text with an f-string / str.format.
 insert_sql = '''insert into url_body (url, body, date) values (?, ?, ?)'''

 # A failed fetch leaves article_chain unchanged, so the same URL would be
 # requested again; bound the number of consecutive failures so the loop
 # cannot spin forever (and without delay) against Wikipedia.
 max_consecutive_failures = 3
 consecutive_failures = 0

 try:
     while continue_crawl(article_chain, target_url):
         print('current last', article_chain[-1])
         # Percent-decoded form of the URL, used as the key in url_body.
         from_url = urllib.parse.unquote(article_chain[-1])
         print('decoded', from_url)

         # --- fetch step -------------------------------------------------
         try:
             first_link, body_text = find_first_link(article_chain[-1])
         except Exception as e:
             print(e, file=sys.stderr)
             consecutive_failures += 1
             if consecutive_failures >= max_consecutive_failures:
                 print('giving up after %d consecutive failures' % consecutive_failures,
                       file=sys.stderr)
                 break
             time.sleep(2) # back off before retrying the same URL
             continue
         consecutive_failures = 0

         if first_link is None:
             print("We've arrived at an article with no links, aborting search!")
             break

         article_chain.append(first_link) # Update the last url
         time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers

         # --- store step -------------------------------------------------
         # A storage error (e.g. a URL that is already in url_body, which
         # violates the primary key) is reported but does not stop the crawl.
         try:
             body_bytes = body_text.encode()
             datestr = '{:%Y%m%d%H%M%S}'.format(datetime.now())

             cur = conn.cursor()
             cur.execute(insert_sql, (from_url, body_bytes, datestr))
             conn.commit()
         except Exception as e:
             print(e, file=sys.stderr)
             continue

         input('see?') #enter to go next
 finally:
     # Always release the database handle, even on Ctrl-C or an unexpected error.
     conn.close()
~