sqlite3 wiki.sqlite3
create table url_body (url text not null, body blob, date text not null, primary key(url));
alter table wiki_links add column fk_url references url_body(url);
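As a quick sanity check (not part of the original post, and assuming the fk_url column added above), the new schema can be inspected from Python with PRAGMA table_info:

# a minimal schema-check sketch (assumption: wiki.sqlite3 and the tables above exist)
import sqlite3

conn = sqlite3.connect('wiki.sqlite3')
cur = conn.cursor()

# PRAGMA table_info returns one row per column: (cid, name, type, notnull, dflt_value, pk)
for row in cur.execute('PRAGMA table_info(url_body)'):
    print('url_body column:', row)
for row in cur.execute('PRAGMA table_info(wiki_links)'):
    print('wiki_links column:', row)

conn.close()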
scrape_wiki_random_blob.py
# -*- coding: utf-8 -*-
from crawl_wiki2 import find_first_link, continue_crawl
#main
import time
import urllib.parse # to decode any URL string
from datetime import datetime
from wiki_sqldb import connect_db, insert_data
import sys
import sqlite3
conn = connect_db('wiki.sqlite3')
start_url = "https://ko.wikipedia.org/wiki/Special:Random"
target_url = "https://ko.wikipedia.org/wiki/한글"
input('start:%s' %start_url)
input('target:%s' %target_url)
article_chain = [start_url]
while continue_crawl(article_chain, target_url):
    print('current last', article_chain[-1])
    from_url = urllib.parse.unquote(article_chain[-1])  # decode the percent-encoded URL
    print('decoded', from_url)
    try:
        first_link, body_text = find_first_link(article_chain[-1])
        if first_link is None:
            print("We've arrived at an article with no links, aborting search!")
            break
        article_chain.append(first_link)  # update the last url
        time.sleep(2)  # slow things down so as to not hammer Wikipedia's servers
        # table schema: url_body (url text, body blob, date text)
        body_bytes = body_text.encode()
        datestr = '{:%Y%m%d%H%M%S}'.format(datetime.now())
        cur = conn.cursor()
        # Note: when the value is of bytes type, interpolating {value} into an
        # f-string SQL statement does not work; use a parameterized query instead.
        sql = '''insert into url_body (url, body, date) values (?, ?, ?)'''
        cur.execute(sql, (from_url, body_bytes, datestr))
        conn.commit()
        input('see?')  # press Enter to move on to the next article
    except Exception as e:
        print(e, file=sys.stderr)
conn.close()
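To check what was actually stored, here is a minimal read-back sketch (not part of the original script; it only assumes the url_body schema created above and that body was saved with str.encode(), i.e. UTF-8):

# read back the most recently stored page body and decode it to text
import sqlite3

conn = sqlite3.connect('wiki.sqlite3')
cur = conn.cursor()

cur.execute('select url, body, date from url_body order by date desc limit 1')
row = cur.fetchone()
if row is None:
    print('url_body is empty')
else:
    url, body_blob, datestr = row
    body_text = body_blob.decode()   # UTF-8 by default, matching body_text.encode() above
    print(url, datestr)
    print(body_text[:200])           # show only the first 200 characters

conn.close()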