Reading the review data file and preprocessing the data
def total_review_toknizer():
    file_path = "/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/text1.txt"
    okt = Okt()
    total_reviews = []
    with open(file_path) as f:
        lines = f.readlines()
    for i in lines:
        total_reviews.append(i[2:].strip("\n"))  # drop the two-character rating prefix
    normalization_total_review = []
    # fix malformed sentences and normalize
    for review in total_reviews:  # all reviews
        clean_review = emoticon_normalize(review, num_repeats=3)      # collapse repeated emoticons to at most 3
        clean_review = repeat_normalize(clean_review, num_repeats=3)  # collapse repeated characters to at most 3
        clean_review = only_hangle(clean_review)                      # strip non-Hangul (e.g. English) characters
        clean_review = okt.normalize(clean_review)                    # Okt normalization
        normalization_total_review.append(clean_review)
    with open("/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/total_review.txt", "w") as total_file:
        for i in total_reviews:
            total_file.write(i + "\n")
    return normalization_total_review
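For reference, a minimal sketch of what the three soynlp normalizers do; the printed outputs are indicative and may differ slightly across soynlp versions:

    from soynlp.normalizer import emoticon_normalize, repeat_normalize, only_hangle

    print(emoticon_normalize("이 영화 존잼ㅋㅋㅋㅋㅋㅋㅋ", num_repeats=3))  # e.g. "이 영화 존잼 ㅋㅋㅋ"
    print(repeat_normalize("와하하하하하하핫", num_repeats=3))             # e.g. "와하하하핫"
    print(only_hangle("맛있어요 good 123!!"))                              # e.g. "맛있어요"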
Morpheme-tagging the preprocessed data
def text_TAG(normalization_review):
    okt = Okt()
    pos_reviews = []
    for review in normalization_review:  # morphological analysis
        clean_review = okt.pos(review, stem=True, join=True)  # reviews were already de-prefixed when read
        pos_reviews.append(clean_review)
    # Okt POS tag set for reference:
    # {'Adjective': adjective, 'Adverb': adverb, 'Alpha': alphabet,
    #  'Conjunction': conjunction, 'Determiner': determiner, 'Eomi': verbal ending,
    #  'Exclamation': exclamation, 'Foreign': foreign words, hanja and other symbols,
    #  'Hashtag': Twitter hashtag, 'Josa': particle, 'KoreanParticle': (e.g. ㅋㅋ),
    #  'Noun': noun, 'Number': number, 'PreEomi': pre-final ending,
    #  'Punctuation': punctuation, 'ScreenName': Twitter handle, 'Suffix': suffix,
    #  'Unknown': unregistered word, 'Verb': verb}
    tag_reviews = []
    for i in pos_reviews:  # keep only the adjectives
        for j in i:
            text_tag = j.split("/")  # e.g. '편리하다/Adjective'
            if text_tag[1] == "Adjective":
                tag_reviews.append(text_tag[0])
    count_tag_reviews = Counter(tag_reviews)
    return count_tag_reviews
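A quick sketch of the Okt tagger output that the "/"-splitting above relies on (the sample sentence is made up, and exact tokens can vary across KoNLPy versions):

    from konlpy.tag import Okt

    okt = Okt()
    print(okt.pos("배송이 빠르고 편리했어요", stem=True, join=True))
    # e.g. ['배송/Noun', '이/Josa', '빠르다/Adjective', '편리하다/Adjective']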
Running the tokenizer over the review data
def text_tokenizer():  # tokenize the review data
    okt = Okt()
    total_text = []
    with open("/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/negative_review.txt", 'r') as f:  # raw positive/negative txt
        lines = f.readlines()
    for line in lines:
        line = line.strip()  # drop the trailing newline
        line = okt.pos(line, stem=True, join=True)
        new_text = []
        for i in line:
            text_tag = i.split("/")  # e.g. '편리하다/Adjective'
            new_text.append(text_tag[0])
        total_text.append(new_text)
    with open("/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/negative_tokenizer_review.txt", "w") as file_path:  # write the tokenized reviews
        for i in total_text:
            file_path.write(" ".join(i) + "\n")
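To make the output format concrete, each line of negative_tokenizer_review.txt ends up as the surface forms joined by spaces. A sketch with a hypothetical review (exact Okt output varies by version):

    from konlpy.tag import Okt

    okt = Okt()
    tagged = okt.pos("배송이 너무 느리고 별로였어요", stem=True, join=True)
    print(" ".join(t.split("/")[0] for t in tagged))
    # e.g. "배송 이 너무 느리다 별로 이다"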
Scoring words against the seed words with SO-PMI
def word_tokenizer(count_reviews, state):
    okt = Okt()
    pmi_dict = {}
    corpus_path = "/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/" + state + "_tokenizer_review.txt"
    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)  # the corpus length is counted sentence by sentence
    word_extractor = WordExtractor()  # word extractor (trained here, though its scores are not used below)
    word_extractor.train(corpus)
    tokenizer = RegexTokenizer()
    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=4,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x: x.split(),
        dynamic_weight=False,
        verbose=True)
    pmi, px, py = pmi_func(
        x,
        min_pmi=0,
        alpha=0.0001,
        beta=0.75,
    )
    vocab2idx = {vocab: idx for idx, vocab in enumerate(idx2vocab)}
    for text, cnt in count_reviews:
        if text not in vocab2idx:  # a seed word may have been pruned by min_tf
            continue
        query = vocab2idx[text]
        submatrix = pmi[query, :].tocsr()  # get the row of the query word
        contexts = submatrix.nonzero()[1]  # nonzero() returns (rows, columns)
        pmi_i = submatrix.data
        most_relateds = [(idx, pmi_ij) for idx, pmi_ij in zip(contexts, pmi_i)]
        most_relateds = sorted(most_relateds, key=lambda x: -x[1])  # sort by PMI score, descending
        most_relateds = [(idx2vocab[idx], pmi_ij) for idx, pmi_ij in most_relateds]
        # pprint(most_relateds)
        for text, pmi_score in most_relateds:  # walk the (token, PMI score) pairs
            a = okt.pos(text)
            for pos_text, tag in a:  # among the PMI-scored tokens, keep only adjectives
                if tag == "Adjective":
                    pmi_dict[pos_text] = pmi_dict.get(pos_text, 0) + pmi_score  # accumulate per adjective
    # pmi_dict = sorted(pmi_dict.items(), key=lambda x: x[1], reverse=True)
    return pmi_dict
Building the positive/negative dictionary
def calculateSOPMI(positive_pmi_dict, negative_pmi_dict):
    result = []
    for key in positive_pmi_dict:
        # only words that co-occur with both the positive and negative seeds get a score
        if key in negative_pmi_dict:
            so_pmi = positive_pmi_dict[key] - negative_pmi_dict[key]
            result.append((key, so_pmi))
    return result
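Conceptually this is the standard SO-PMI construction: each candidate word w is scored by how much more strongly it co-occurs with the positive seed adjectives than with the negative ones. With Pos and Neg the top-20 seed sets extracted above:

    PMI(w_1, w_2) = \log \frac{p(w_1, w_2)}{p(w_1)\, p(w_2)}

    SO\text{-}PMI(w) = \sum_{p \in Pos} PMI(w, p) - \sum_{n \in Neg} PMI(w, n)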
Full code
from collections import Counter
import time
import json
from konlpy.tag import Okt
from soynlp.vectorizer import sent_to_word_contexts_matrix
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.normalizer import *
from soynlp.word import pmi as pmi_func
from soynlp.tokenizer import RegexTokenizer
from pprint import pprint
import pandas as pd
import numpy as np
#from hanspell import spell_checker  # spelling & spacing correction //// corrected = spell_checker.check(raw_sentence).checked
#from soynlp.normalizer import *  # collapse meaninglessly repeated characters //// corrected = emoticon_normalize(raw_sentence, num_repeats=2) & corrected = repeat_normalize(raw_sentence, num_repeats=2)
class KnuSL():
    @staticmethod
    def data_list(wordname):
        with open('data/SentiWord_info.json', encoding='utf-8-sig', mode='r') as f:
            data = json.load(f)
        r_word, s_word = 'None', 'None'
        for entry in data:
            if entry['word'] == wordname:
                r_word = entry['word_root']
                s_word = entry['polarity']
        print('root : ' + r_word)
        print('polarity : ' + s_word)
        return r_word, s_word
# if __name__ == "__main__":
#
#     ksl = KnuSL
#
#     print("\nThis is the KNU Korean sentiment dictionary~ :)")
#     print("If a word is not in the dictionary, the result is None!!!")
#     print("Enter # to quit!!!")
#     print("-2: very negative, -1: negative, 0: neutral or unknown, 1: positive, 2: very positive")
#     print("\n")
#
#     while (True):
#         wordname = input("word : ")
#         wordname = wordname.strip(" ")
#         if wordname != "#":
#             print(ksl.data_list(wordname))
#             print("\n")
#
#         elif wordname == "#":
#             print("\nThank you for using it~ :)")
#             break
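For context, data_list expects the KNU sentiment lexicon (KnuSentiLex) JSON; its entries carry the three fields the code reads. A minimal sketch with illustrative values (the real file is much larger):

    # illustrative shape of data/SentiWord_info.json (example entries, not the real contents)
    sample = [
        {"word": "가난", "word_root": "가난", "polarity": "-1"},
        {"word": "맛있다", "word_root": "맛있", "polarity": "1"},
    ]

    # usage: returns (word_root, polarity), both "None" strings on a miss
    root, polarity = KnuSL.data_list("맛있다")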
def setiment_Score():  # compute the polarity value of sentiment words
    high_review = ["맛있", "저렴", "착한"]
    total_SentimentScore = []
    ksl = KnuSL
    total_sum = 0
    for word in high_review:
        _, polarity = ksl.data_list(word.strip(" "))
        if polarity != 'None':  # skip words missing from the dictionary
            total_sum += int(polarity)
    SentimentScore = total_sum / len(high_review)  # average polarity of the sentiment words appearing in the review
    total_SentimentScore.append(SentimentScore)
    return total_SentimentScore
def fileread():  # read the review file, split into positive/negative, then preprocess
    file_path = "/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/text1.txt"
    positive_reviews = []
    negative_reviews = []
    total_reviews = []
    # o = open('/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/text1.txt', 'w')
    # with open(file_path) as t:
    #     lines = t.readlines()
    #     for i in lines:
    #         i = i.replace(" ", ",")
    #         o.write(i)
    # o.close()
    with open(file_path) as f:
        lines = f.readlines()
    for i in lines:
        if i[0] == "5" or i[0] == "4":
            positive_reviews.append(i[2:].strip("\n"))
        else:
            negative_reviews.append(i[2:].strip("\n"))
        total_reviews.append(i[2:].strip("\n"))
    okt = Okt()
    normalization_positive_review = []  # ratings 4-5
    normalization_negative_review = []  # ratings 1-3
    # fix malformed sentences and normalize
    for review in positive_reviews:  # positive reviews
        clean_review = emoticon_normalize(review, num_repeats=3)      # collapse repeated emoticons to at most 3
        clean_review = repeat_normalize(clean_review, num_repeats=3)  # collapse repeated characters to at most 3
        clean_review = only_hangle(clean_review)                      # strip non-Hangul (e.g. English) characters
        clean_review = okt.normalize(clean_review)                    # Okt normalization
        normalization_positive_review.append(clean_review)
    for review in negative_reviews:  # negative reviews
        clean_review = emoticon_normalize(review, num_repeats=3)      # collapse repeated emoticons to at most 3
        clean_review = repeat_normalize(clean_review, num_repeats=3)  # collapse repeated characters to at most 3
        clean_review = only_hangle(clean_review)                      # strip non-Hangul (e.g. English) characters
        clean_review = okt.normalize(clean_review)                    # Okt normalization
        normalization_negative_review.append(clean_review)
    with open("/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/positive_review.txt", "w") as positive_file_path:
        for i in positive_reviews:
            positive_file_path.write(i + "\n")
    with open("/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/negative_review.txt", "w") as negative_file_path:
        for i in negative_reviews:
            negative_file_path.write(i + "\n")
    return normalization_positive_review, normalization_negative_review
def total_review_toknizer():
    file_path = "/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/text1.txt"
    okt = Okt()
    total_reviews = []
    with open(file_path) as f:
        lines = f.readlines()
    for i in lines:
        total_reviews.append(i[2:].strip("\n"))  # drop the two-character rating prefix
    normalization_total_review = []
    # fix malformed sentences and normalize
    for review in total_reviews:  # all reviews
        clean_review = emoticon_normalize(review, num_repeats=3)      # collapse repeated emoticons to at most 3
        clean_review = repeat_normalize(clean_review, num_repeats=3)  # collapse repeated characters to at most 3
        clean_review = only_hangle(clean_review)                      # strip non-Hangul (e.g. English) characters
        clean_review = okt.normalize(clean_review)                    # Okt normalization
        normalization_total_review.append(clean_review)
    with open("/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/total_review.txt", "w") as total_file:
        for i in total_reviews:
            total_file.write(i + "\n")
    return normalization_total_review
def text_TAG(normalization_review):
    okt = Okt()
    pos_reviews = []
    for review in normalization_review:  # morphological analysis
        clean_review = okt.pos(review, stem=True, join=True)  # reviews were already de-prefixed in fileread()
        pos_reviews.append(clean_review)
    # Okt POS tag set for reference:
    # {'Adjective': adjective, 'Adverb': adverb, 'Alpha': alphabet,
    #  'Conjunction': conjunction, 'Determiner': determiner, 'Eomi': verbal ending,
    #  'Exclamation': exclamation, 'Foreign': foreign words, hanja and other symbols,
    #  'Hashtag': Twitter hashtag, 'Josa': particle, 'KoreanParticle': (e.g. ㅋㅋ),
    #  'Noun': noun, 'Number': number, 'PreEomi': pre-final ending,
    #  'Punctuation': punctuation, 'ScreenName': Twitter handle, 'Suffix': suffix,
    #  'Unknown': unregistered word, 'Verb': verb}
    tag_reviews = []
    for i in pos_reviews:  # keep only the adjectives
        for j in i:
            text_tag = j.split("/")  # e.g. '편리하다/Adjective'
            if text_tag[1] == "Adjective":
                tag_reviews.append(text_tag[0])
    count_tag_reviews = Counter(tag_reviews)
    return count_tag_reviews
def text_tokenizer():  # tokenize the review data
    okt = Okt()
    total_text = []
    with open("/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/negative_review.txt", 'r') as f:  # raw positive/negative txt
        lines = f.readlines()
    for line in lines:
        line = line.strip()  # drop the trailing newline
        line = okt.pos(line, stem=True, join=True)
        new_text = []
        for i in line:
            text_tag = i.split("/")  # e.g. '편리하다/Adjective'
            new_text.append(text_tag[0])
        total_text.append(new_text)
    with open("/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/negative_tokenizer_review.txt", "w") as file_path:  # write the tokenized reviews
        for i in total_text:
            file_path.write(" ".join(i) + "\n")
def word_tokenizer(count_reviews, state):
    okt = Okt()
    pmi_dict = {}
    corpus_path = "/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/" + state + "_tokenizer_review.txt"
    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)  # the corpus length is counted sentence by sentence
    word_extractor = WordExtractor()  # word extractor (trained here, though its scores are not used below)
    word_extractor.train(corpus)
    tokenizer = RegexTokenizer()
    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=4,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x: x.split(),
        dynamic_weight=False,
        verbose=True)
    pmi, px, py = pmi_func(
        x,
        min_pmi=0,
        alpha=0.0001,
        beta=0.75,
    )
    vocab2idx = {vocab: idx for idx, vocab in enumerate(idx2vocab)}
    for text, cnt in count_reviews:
        if text not in vocab2idx:  # a seed word may have been pruned by min_tf
            continue
        query = vocab2idx[text]
        submatrix = pmi[query, :].tocsr()  # get the row of the query word
        contexts = submatrix.nonzero()[1]  # nonzero() returns (rows, columns)
        pmi_i = submatrix.data
        most_relateds = [(idx, pmi_ij) for idx, pmi_ij in zip(contexts, pmi_i)]
        most_relateds = sorted(most_relateds, key=lambda x: -x[1])  # sort by PMI score, descending
        most_relateds = [(idx2vocab[idx], pmi_ij) for idx, pmi_ij in most_relateds]
        # pprint(most_relateds)
        for text, pmi_score in most_relateds:  # walk the (token, PMI score) pairs
            a = okt.pos(text)
            for pos_text, tag in a:  # among the PMI-scored tokens, keep only adjectives
                if tag == "Adjective":
                    pmi_dict[pos_text] = pmi_dict.get(pos_text, 0) + pmi_score  # accumulate per adjective
    # pmi_dict = sorted(pmi_dict.items(), key=lambda x: x[1], reverse=True)
    return pmi_dict
def calculateSOPMI(positive_pmi_dict, negative_pmi_dict):
    result = []
    for key in positive_pmi_dict:
        # only words that co-occur with both the positive and negative seeds get a score
        if key in negative_pmi_dict:
            so_pmi = positive_pmi_dict[key] - negative_pmi_dict[key]
            result.append((key, so_pmi))
    return result
positive_normalization_review, negative_normalization_review = fileread()  # read and normalize the raw reviews
total_review = total_review_toknizer()  # preprocess the full review set
positive_count_reviews = text_TAG(positive_normalization_review)  # adjective counts after POS tagging
negative_count_reviews = text_TAG(negative_normalization_review)
# Counter subtraction keeps only positive counts, so each side retains the adjectives it uses more often,
# e.g. Counter(a=3, b=1) - Counter(a=1, b=2) == Counter({'a': 2})
positive_only_counts = positive_count_reviews - negative_count_reviews
negative_only_counts = negative_count_reviews - positive_count_reviews
positive_count_reviews = positive_only_counts.most_common(20)  # top-20 positive seed adjectives
negative_count_reviews = negative_only_counts.most_common(20)  # top-20 negative seed adjectives
text_tokenizer()  # tokenize the review data
print(positive_count_reviews)
print(negative_count_reviews)
positive_pmi_dict = word_tokenizer(positive_count_reviews, "total")  # PMI of the positive seeds against the full corpus
negative_pmi_dict = word_tokenizer(negative_count_reviews, "total")  # PMI of the negative seeds against the full corpus
# ambiguous tokens such as 않다 ("not") and 같다 ("like") end up in both dictionaries
result = calculateSOPMI(positive_pmi_dict, negative_pmi_dict)
result = sorted(result, key=lambda x: x[1], reverse=True)
column_name = ['text', 'score']
df = pd.DataFrame(result, columns=column_name)
#         text     score
# 0       좋다  8.251249
# 1     빠르다  6.580951
# 2     가볍다  6.106031
# 3   튼튼하다  6.095806
# 4   만족하다  5.797576
df["score"] = (df["score"] - df["score"].mean()) / df["score"].std()  # z-score normalization
# print(df)
data = np.array_split(df, 5)  # split into 5 bins for the -2,-1,0,1,2 scoring
# Known limitation, negation: "좋지 않다" ("not good") <=> "좋다" ("good").
# "편하게 뿌리는 스타일이 좋지않았습니다" tokenizes to "편하다 뿌리다 스타일 이 좋다 않다",
# so "좋다" still gets counted as positive even though the sentence is negative (않다, 못).
scores = [2, 1, 0, -1, -2]
for i in range(5):  # discretize by SO-PMI rank
    data[i]['score'] = scores[i]
total_df = pd.concat([data[0], data[1], data[2], data[3], data[4]])
total_df.to_json("/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/SentiWord.json", orient='table', force_ascii=False, indent=4)
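A minimal sketch of loading the finished dictionary back for lookups (assuming the same path; orient='table' round-trips through read_json with the same orient):

    import pandas as pd

    senti = pd.read_json(
        "/Users/sunho99/PycharmProjects/python_Project/setiment_dictionary_project/SentiWord.json",
        orient="table",
    )
    lookup = dict(zip(senti["text"], senti["score"]))
    print(lookup.get("좋다"))  # e.g. 2 for a strongly positive adjective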