수업정리/Fundamental

[Python] 한글 유니코드 5 - 한글 decompose 해서 모음,자음 넣기

GreenBNN 2023. 5. 19. 14:23

이전에 우리는 cc_han_list.py 로 한 글자씩 세어서 sqlite3 파일에 넣는 것을 했다.

이번에는 한 글자를 분해해서 자음,모음의 개수를 세어 넣어 보겠다.

지금까지 정리

python cc_han_list.py > han_characters.txt  :  한글 텍스트를 한 글자씩 세어서 저장

python han_sqldb.py   :  우리가 만든 han_characters.txt 를  hangulDB.sqlite3 에 삽입


cl_han_list.py

                                                                                                              import glob
 from collections import Counter
 from draw_bar import draw_bar
 from Han_fonts import set_Han_font
 from Hangul import *

 def read_all(file_list):
     """Read every file in `file_list` and return their text as one list of characters.

     Args:
         file_list: iterable of paths to UTF-8 encoded text files.

     Returns:
         A list of single-character strings, in file order then text order.
         An empty `file_list` yields an empty list.
     """
     data = []
     for path in file_list:
         # `with` guarantees the handle is closed even if read() raises
         # (e.g. on a decoding error), which the manual open/close did not.
         with open(path, 'r', encoding='utf-8') as ifile:
             # Extending a list with a str appends it character by character.
             data.extend(ifile.read())
     return data

 #print(all_data)
 #add codes for counting all words
 def counts(data, top=None):
     """Count the items of `data` and return them split into two parallel tuples.

     Args:
         data: iterable of hashable items (characters, letters, words, ...).
         top: if given, keep only the `top` most common items; None keeps all.

     Returns:
         (items, frequencies): two tuples of equal length, ordered from the
         most to the least frequent item. Both are empty when there is
         nothing to count.
     """
     wc_list = Counter(data).most_common(top)
     if not wc_list:
         # zip(*[]) yields nothing, so the two-name unpacking below would
         # raise ValueError on empty input (or top=0).
         return (), ()
     w_list, c_list = zip(*wc_list)
     return w_list, c_list

 def is_hangul_chr(char):
     """Return True if `char` is a precomposed Hangul syllable ('가' ~ '힣').

     Args:
         char: a single-character string (ord() raises TypeError otherwise).

     Returns:
         True for code points U+AC00 ('가') through U+D7A3 ('힣'), else False.
     """
     # '힣' is U+D7A3; the previous upper bound 0xd7ac also accepted the
     # code points U+D7A4..U+D7AC, which lie past the last syllable.
     return 0xAC00 <= ord(char) <= 0xD7A3


 # Gather every file under news_data/ and load their text as one list of characters.
 f_list = glob.glob('news_data/*')
 all_data = read_all(f_list)

 # Keep only the characters that is_hangul_chr accepts.
 all_han_data = list(filter(is_hangul_chr, all_data))

 # Split each syllable with Hangul.decompose and drop empty-string pieces
 # (presumably the slot of a missing final consonant; confirm in Hangul.py).
 h = Hangul()
 all_han_letters = [
     letter
     for sylla in all_han_data
     for letter in h.decompose(sylla)
     if letter != ''
 ]

 # Count the decomposed letters; pass all_han_data instead to count whole syllables.
 wlist, clist = counts(all_han_letters)

 # Emit one "<letter>\t<count>" line per entry, in the order counts() returned them.
 for w, c in zip(wlist, clist):
     print(f"{w}\t{c}")

우리의 한글데이터를 한 글자씩 가져와서 decompose 하고 그것들을 세어서 han_letters.txt 에 (wlist, clist) 을 저장한다. 스크립트는 결과를 화면에 출력하므로 python cl_han_list.py > han_letters.txt 처럼 출력을 파일로 리다이렉트해서 저장한다.

나중에 똑같이 python han_sqldb.py 해서 han_letters.txt 를 hangulDB.sqlite3 에 저장해주면 된다.

 han_sqldb.py 

#coding:utf-8
 import sqlite3
 import sys

 # Open the SQLite database `dbname`: an existing file is connected to,
 # a missing one is created.
 def connect_db(dbname):
     """Return a sqlite3 connection to the database named `dbname`."""
     return sqlite3.connect(dbname)
 def insert_characters(conn, infilename):
     """Load character frequencies from a text file into the han_characters table.

     Each non-empty line of `infilename` holds two whitespace-separated
     fields, "<character>\t<frequency>". The table is created if it does not
     exist. Loading is best-effort: malformed lines and rows rejected by the
     database (e.g. a duplicate character) are reported on stderr and skipped.

     Args:
         conn: an open sqlite3 connection.
         infilename: path of the UTF-8 encoded input file.

     Raises:
         OSError: if `infilename` cannot be opened.
     """
     # `with` closes the file even if reading fails.
     with open(infilename, 'r', encoding='utf-8') as ifile:
         data = ifile.readlines()  # one record per line: "<character>\t<frequency>"

     cur = conn.cursor()
     tsql = '''create table if not exists han_characters
     ( character text not null unique, freq integer not null );'''

     print(tsql)
     cur.execute(tsql)  # run 'create table ...' sql.

     # Parse every line first, then insert each record exactly once.
     records = []
     for line in data:
         fields = line.split()  # split on any whitespace; also drops the '\n'
         if not fields:
             continue  # ignore empty lines
         if len(fields) != 2:
             print(f"skipping malformed line: {line!r}", file=sys.stderr)
             continue
         char, freq = fields
         try:
             records.append((char, int(freq)))
         except ValueError:
             print(f"skipping non-integer frequency: {line!r}", file=sys.stderr)

     # Placeholders let sqlite3 do the quoting, so a character such as "'"
     # can neither break nor alter the statement.
     isql = "insert into han_characters (character, freq) values (?, ?);"
     for char, freq in records:
         # The printed statement is only a log line; it is never executed.
         print(f"insert into han_characters (character, freq) values ('{char}', {freq});")
         try:
             # a single insert may fail (e.g. duplicate of a unique character)
             cur.execute(isql, (char, freq))
         except (sqlite3.Error, OverflowError) as e:
             print(e, file=sys.stderr)

     # Persist the rows here instead of relying on a later caller to commit.
     conn.commit()
 def insert_letters (conn, infilename):
     """Load letter frequencies from a text file into the han_letters table.

     Each non-empty line of `infilename` holds two whitespace-separated
     fields, "<letter>\t<frequency>". The table is created if it does not
     exist. Loading is best-effort: malformed lines and rows rejected by the
     database (e.g. a duplicate letter) are reported on stderr and skipped.
     The inserted rows are committed before returning.

     Args:
         conn: an open sqlite3 connection.
         infilename: path of the UTF-8 encoded input file.

     Raises:
         OSError: if `infilename` cannot be opened.
     """
     # `with` closes the file even if reading fails.
     with open(infilename, 'r', encoding='utf-8') as ifile:
         data = ifile.readlines()  # one record per line: "<letter>\t<frequency>"

     cur = conn.cursor()
     tsql = '''create table if not exists han_letters (letter text not null unique, freq integer not null ); '''

     print(tsql)
     cur.execute(tsql)

     records = []
     for line in data:
         fields = line.split()  # split on any whitespace; also drops the '\n'
         if not fields:
             continue  # ignore empty lines
         if len(fields) != 2:
             print(f"skipping malformed line: {line!r}", file=sys.stderr)
             continue
         char, freq = fields
         try:
             records.append((char, int(freq)))
         except ValueError:
             print(f"skipping non-integer frequency: {line!r}", file=sys.stderr)

     # Placeholders let sqlite3 do the quoting, so a letter such as "'"
     # can neither break nor alter the statement.
     isql = "insert into han_letters (letter, freq) values (?, ?)"
     for char, freq in records:
         # The printed statement is only a log line; it is never executed.
         print(f"insert into han_letters (letter, freq) values ('{char}', {freq})")
         try:
             # a single insert may fail (e.g. duplicate of a unique letter)
             cur.execute(isql, (char, freq))
         except (sqlite3.Error, OverflowError) as e:
             print(e, file=sys.stderr)

     conn.commit()

 if __name__ == "__main__":
     # Load both frequency files into hangulDB.sqlite3 when run as a script.
     conn = None
     try:
         conn = connect_db('hangulDB.sqlite3')
         insert_characters(conn, 'han_characters.txt')
         insert_letters(conn, 'han_letters.txt')
         # Alternative input kept from the original script:
         #insert_letters(conn, 'han_all_letters.txt')
     except Exception as e:
         # Top-level boundary: report on stderr so that `2> errors.logs`
         # captures the failure together with the per-row errors.
         print(e, file=sys.stderr)
     finally:
         # Release the database file whether or not loading succeeded.
         if conn is not None:
             conn.close()

Hangul.py 에서 조금 더 특수문자나 오래된 한글 자음같은거 추가해주기

위 부분 코드 수정해주면 된다.


python han_sqldb.py 1> normal.logs 2> errors.logs

이거는 우리가 stderr 처리를 해줄 때 평범한 출력인 경우 1>normal.logs 에 저장하고

오류인 경우 2>errors.logs 에 저장하는 코드이다.