수업정리/Fundamental
[Python] 한글 유니코드 5 - 한글 decompose 해서 모음,자음 넣기
GreenBNN
2023. 5. 19. 14:23
이전에 우리는 cc_han_list.py 로 한 글자씩 세어서 sqlite3 파일에 넣는 것을 했다.
이번에는 한 글자를 분해해서 자음,모음의 개수를 세어 넣어 보겠다.
지금까지 정리
python cc_han_list.py > han_characters.txt : 한글 텍스트를 한 글자씩 세어서 저장
python han_sqldb.py : 우리가 만든 han_characters.txt 를 hangulDB.sqlite3 에 삽입
cl_han_list.py
import glob
from collections import Counter
from draw_bar import draw_bar
from Han_fonts import set_Han_font
from Hangul import *
def read_all(file_list):
    """Read every file in *file_list* and return all characters as one list.

    Args:
        file_list: Iterable of paths to UTF-8 encoded text files.

    Returns:
        A flat list with one single-character string per character read,
        in file order.  An empty *file_list* yields an empty list.

    Raises:
        OSError: If a file cannot be opened or read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = []
    for f in file_list:
        # The context manager closes the file even when read() raises.
        with open(f, 'r', encoding='utf-8') as ifile:
            # Extending a list with a str appends it character by character.
            data.extend(ifile.read())
    return data
#print(all_data)
#add codes for counting all words
def counts(data, top=None):
    """Count the items of *data* and return them ordered by frequency.

    Args:
        data: Iterable of hashable items (e.g. characters).
        top: Maximum number of distinct items to return; ``None`` returns
            all of them.

    Returns:
        A pair ``(items, frequencies)`` of equally long tuples, most frequent
        first.  Both tuples are empty when there is nothing to count.
    """
    wc_list = Counter(data).most_common(top)
    if not wc_list:
        # zip(*[]) yields nothing, so the two-name unpacking below would
        # raise ValueError for empty input or top=0.
        return (), ()
    w_list, c_list = zip(*wc_list)
    return w_list, c_list
def is_hangul_chr(char):
    """Return True if *char* is a precomposed Hangul syllable ('가'..'힣').

    The Hangul Syllables block spans U+AC00 ('가') through U+D7A3 ('힣');
    the code points right after it are not syllables and must be rejected.

    Args:
        char: A single-character string.

    Returns:
        True when the character lies in U+AC00..U+D7A3, otherwise False.

    Raises:
        TypeError: If *char* is not a string of length one (from ``ord``).
    """
    return 0xAC00 <= ord(char) <= 0xD7A3
# Gather every file under news_data/ and load them as one list of characters.
f_list = glob.glob('news_data/*')
all_data = read_all(f_list)

# Keep only the precomposed Hangul syllables; everything else is discarded.
all_han_data = list(filter(is_hangul_chr, all_data))

# Break each syllable into its letters.  decompose() can yield '' entries
# (presumably for a missing component - confirm in Hangul.py); drop them.
h = Hangul()
all_han_letters = [
    letter
    for letters in map(h.decompose, all_han_data)
    for letter in letters
    if letter != ''
]

# Count the letters; pass all_han_data instead to count whole syllables.
wlist, clist = counts(all_han_letters)

# Emit one "letter<TAB>count" line per entry on stdout.
for w, c in zip(wlist, clist):
    print(f"{w}\t{c}")
우리의 한글 데이터를 한 글자씩 가져와서 decompose 하고, 분해된 자음·모음을 세어서 (wlist, clist) 를 한 줄씩 출력한다. python cl_han_list.py > han_letters.txt 로 실행하면 그 결과가 han_letters.txt 에 저장된다.
나중에 똑같이 python han_sqldb.py 해서 han_letters.txt 를 hangulDB.sqlite3 에 저장해주면 된다.
han_sqldb.py
#coding:utf-8
import sqlite3
import sys
def connect_db(dbname):
    """Open the SQLite database *dbname* and return the connection.

    An existing database file is opened; if it does not exist yet,
    ``sqlite3.connect`` creates it.
    """
    return sqlite3.connect(dbname)
def insert_characters(conn, infilename):
    """Load ``character frequency`` lines from a file into ``han_characters``.

    The table is created if it does not exist.  Every statement is echoed to
    stdout; rows that cannot be inserted (e.g. a character that is already
    stored) and malformed lines are reported on stderr and skipped, so one
    bad record never aborts the whole load.  The inserts are committed
    before returning.

    Args:
        conn: An open ``sqlite3`` connection.
        infilename: Path of a UTF-8 text file holding one
            whitespace-separated ``character frequency`` pair per line.
            Blank lines are ignored.

    Raises:
        OSError: If *infilename* cannot be opened or read.
    """
    with open(infilename, 'r', encoding='utf-8') as ifile:
        data = ifile.readlines()

    cur = conn.cursor()
    tsql = '''create table if not exists han_characters
( character text not null unique, freq integer not null );'''
    print(tsql)
    cur.execute(tsql)

    # Values are bound as parameters so that a quote character in the data
    # can neither break the statement nor inject SQL.
    insert_sql = "insert into han_characters (character, freq) values (?, ?);"
    for line in data:
        line = line.strip()
        if line == '':
            continue
        try:
            char, freq = line.split()
            count = int(freq)
        except ValueError:
            print(f"skipping malformed line: {line!r}", file=sys.stderr)
            continue
        # Log the statement in its literal form for the stdout record.
        print(f"insert into han_characters (character, freq) values ('{char}', {freq});")
        try:
            cur.execute(insert_sql, (char, count))
        except (sqlite3.Error, OverflowError) as e:
            # Typically a UNIQUE violation for an already stored character;
            # OverflowError covers counts too large for an SQLite INTEGER.
            print(e, file=sys.stderr)
    # Without a commit the rows are discarded when the connection closes.
    conn.commit()
def insert_letters(conn, infilename):
    """Load ``letter frequency`` lines from a file into ``han_letters``.

    The table is created if it does not exist.  Every statement is echoed to
    stdout; rows that cannot be inserted (e.g. a letter that is already
    stored) and malformed lines are reported on stderr and skipped, so one
    bad record never aborts the whole load.  The inserts are committed
    before returning.

    Args:
        conn: An open ``sqlite3`` connection.
        infilename: Path of a UTF-8 text file holding one
            whitespace-separated ``letter frequency`` pair per line.
            Blank lines are ignored.

    Raises:
        OSError: If *infilename* cannot be opened or read.
    """
    with open(infilename, 'r', encoding='utf-8') as ifile:
        data = ifile.readlines()

    cur = conn.cursor()
    tsql = '''create table if not exists han_letters (letter text not null unique, freq integer not null ); '''
    print(tsql)
    cur.execute(tsql)

    # Values are bound as parameters so that a quote character in the data
    # can neither break the statement nor inject SQL.
    insert_sql = "insert into han_letters (letter, freq) values (?, ?)"
    for line in data:
        line = line.strip()
        if line == '':
            continue
        try:
            char, freq = line.split()
            count = int(freq)
        except ValueError:
            print(f"skipping malformed line: {line!r}", file=sys.stderr)
            continue
        # Log the statement in its literal form for the stdout record.
        print(f"insert into han_letters (letter, freq) values ('{char}', {freq})")
        try:
            cur.execute(insert_sql, (char, count))
        except (sqlite3.Error, OverflowError) as e:
            # Typically a UNIQUE violation for an already stored letter;
            # OverflowError covers counts too large for an SQLite INTEGER.
            print(e, file=sys.stderr)
    conn.commit()
if __name__ == "__main__":
    # Load both frequency files into hangulDB.sqlite3.
    # (A commented-out variant in the original loaded 'han_all_letters.txt'
    # through insert_letters in the same way.)
    conn = None
    try:
        conn = connect_db('hangulDB.sqlite3')
        insert_characters(conn, 'han_characters.txt')
        insert_letters(conn, 'han_letters.txt')
    except Exception as e:
        # Top-level boundary: report on stderr like every other error so that
        # "1> normal.logs 2> errors.logs" separates the two streams cleanly.
        print(e, file=sys.stderr)
    finally:
        # Release the database file even when a load step failed.
        if conn is not None:
            conn.close()
Hangul.py 에서 조금 더 특수문자나 오래된 한글 자음같은거 추가해주기
위 부분 코드 수정해주면 된다.
python han_sqldb.py 1> normal.logs 2> errors.logs
이거는 우리가 stderr 처리를 해줄 때 평범한 출력인 경우 1>normal.logs 에 저장하고
오류인 경우 2>errors.logs 에 저장하는 코드이다.