Python(Colab) 자연어처리 + 데이터 전처리 연습

Python(Colab) 자연어처리 + 데이터 전처리 연습

2023. 7. 25. 23:05ㆍ파이썬/자연어 처리

1. 모듈 import

# 뉴스 기사 크롤링해주는 라이브러리

!pip install newspaper3k

import newspaper

# 지원하는 언어 보기

newspaper.languages()

from newspaper import Article

URL = 'https://v.daum.net/v/20230623113951712'

article = Article(URL,language='ko')

article.download()

article.parse()

print('title',article.title)

print('title',article.text)

해석

1. newspaper를 사용하기 위해 pip install newspaper3k
2. 설치 완료 후 import newspaper
3. from newspaper import Article

4. 그 후 원하는 기사 URL을 변수로 저장 후 다운로드

2. 불필요한 단어가 포함된 문장 추가하기

additional_info = [

"※ 기자 김사과(apple@apple.com) 취재 반하나(banana@banana.com)",

"<h2>톰 크루즈가 연기하는 아이언맨 '결사반대'한다는 로다주</h2>",

"이 기사는 임시 데이터임을 알림니다!",

"Copyrights© Pressian.com",

"<br> ☞ 이 기사는 문화 섹션으로 분류 했습니다 … </br>",

"#기사 #문화 #톰크루즈 #반대"

]

context = article.text.split('\n')

context += additional_info

for i,text in enumerate(context):

print(i,text)

해석

context라는 변수에 문장을 띄어쓰기로 나눈 것을 저장
그 후 불필요한 단어가 포함된 단어를 추가하고 출력

3. 불용어 처리하기

# 불용어 사전 정의

stopwords = ['이하', '바로', '☞', '※', '…']

# 불용어 제거 함수 (방법1)

def delete_stopwords(context):

preprocessed_text = []

for text in context:

text = [w for w in text.split(' ') if w not in stopwords]

preprocessed_text.append(' '.join(text))

return preprocessed_text

#방법2)

for i,text in enumerate(preprocessed_text):

for j in stopwords:

if context[i] in j:

context[i].replace(j,'')

# 불용어 제거 시키기

preprocessed_text = delete_stopwords(context)

#출력으로 확인

for i,text in enumerate(preprocessed_text):

print(i,text)

해석

1. 불용어로 처리할 단어를 stopwords에 담는다.
2. delete_stopwords 함수를 정의
리스트 컴프리헨션 [w for w in text.split('') if w not in stopwords]
3. 지워진 단어를 preprocessed_text를 담아서 return

4. html태그 지우기

import re

def delete_html_tag(context):

preprocessed_text = []

for text in context:

remove_text =re.sub(r'<[^>]+>','',text).strip() #re.sub(r'정규식',True일때, False일떄).strip()

preprocessed_text.append(remove_text)

return preprocessed_text

# 강사님 버전

def delete_html_tag(context):

preprocessed_text = []

for text in context:

text = re.sub(r'<[^>]+>\s+(?=<)|<[^>]+>', '', text).strip()

if text:

preprocessed_text.append(text)

return preprocessed_text

preprocessed_text = delete_html_tag(preprocessed_text)

for i,text in enumerate(preprocessed_text):

print(i,text)

해석

학원을 다니면서 강사님 버전과 내가 한 버전 두가지로 이루어져있다.

5. 문장 분리하기

✔️ 학습 데이터를 구성할 때 input 데이터 설정이 애매짐으로, 문장 단위로 모델이 학습하도록 유도하기 위해
문장분리가 중요하다!

한국어 문장 분리기 중 가장 성능이 우수한 것으로 알려진 kss라이브러리 사용 예정

https://github.com/hyunwoongko/kss

GitHub - hyunwoongko/kss: Kss: A Toolkit for Korean sentence segmentation

Kss: A Toolkit for Korean sentence segmentation. Contribute to hyunwoongko/kss development by creating an account on GitHub.

github.com

! pip install kss

import kss

def sectence_seperator(processed_context):

splited_context = []

for text in processed_context:

text = text.strip()

if text:

splited_text = kss.split_sentences(text)

splited_context.extend(splited_text)

return splited_context

splited_context = sectence_seperator(preprocessed_text)

for i,text in enumerate(splited_context):

print(i,text)

6. 이메일 제거하기

def delete_email(context):

preprocessed_text = []

for text in context:

text = re.sub(r'[a-zA-Z0-9+-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', '', text).strip()

if text:

preprocessed_text.append(text)

return preprocessed_text

preprocessed_context = delete_email(splited_context)

for i,text in enumerate(preprocessed_context):

print(i,text)

7. 해쉬태그 제거하기

def delete_hashtag(context):

preprocessed_text = []

for text in context:

text = re.sub(r'#\S+', '', text).strip()

if text:

preprocessed_text.append(text)

return preprocessed_text

preprocessed_context = delete_hashtag(preprocessed_context)

for i,text in enumerate(preprocessed_context):

print(i,text)

def delete_copyright(context):

re_patterns = [

r'(Copyrights)|(\(c\))|(\(C\))|©|(C)|'

]

preprocessed_text = []

for text in context:

for re_pattern in re_patterns:

text = re.sub(re_pattern, "", text).strip()

if text:

preprocessed_text.append(text)

return preprocessed_text

preprocessed_context = delete_hashtag(preprocessed_context)

for i,text in enumerate(preprocessed_context):

print(i,text)

반복 횟수가 많은 문자 정규화하기

!pip install soynlp

from soynlp.normalizer import *

#자음 모음에 한해서 바뀜

print(repeat_normalize('ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ',num_repeats=2))

print(repeat_normalize('야!!! 너!!! 지금 뭐함 ㅜㅜㅜㅜㅜㅜㅜㅜㅜㅜㅜㅜㅜㅜ',num_repeats=2))

데이터 후처리 Cleaning 하기

형태소 분석 기반 필터링

!pip install mecab-python

!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

from konlpy.tag import Mecab

객체 생성

mecab = Mecab()

morphs = mecab.pos('아버지가방에들어가신다', join=False)

print(morphs)

명사 동사 형용사 포함 여부 필터링하기

# 명사(NN), 동사(V), 형용사(J)의 포함 여부에 따라 문장 필터링

def morph_filter(context):

NN_TAGS = ['NNG', 'NNP', 'NNB', 'NP']

V_TAGS = ['VV', 'VA', 'VX', 'VCP', 'VCN', 'XSN', 'XSA', 'XSV']

J_TAGS = ['JKS', 'J', 'JO', 'JK', 'JKC', 'JKG', 'JKB', 'JKV', 'JKQ',

'JX', 'JC', 'JKI', 'JKO', 'JKM', 'ETM']

preprocessed_text = []

for text in context:

morphs = mecab.pos(text, join=False) #형태소 분석

nn_flag = False

v_flag = False

j_flag = False

print(morphs)

#완전한 문장인지 확인하는 곳(명사,형용사,동사의 유무)

for morph in morphs:

pos_tags = morph[1].split("+")

for pos_tag in pos_tags:

if not nn_flag and pos_tag in NN_TAGS:

nn_flag = True

if not v_flag and pos_tag in V_TAGS:

v_flag = True

if not j_flag and pos_tag in J_TAGS:

j_flag = True

if nn_flag and v_flag and j_flag:

preprocessed_text.append(text)

break

return preprocessed_text

# 완전적인 문장일때는 출력이 된다.

post_precessed_context = morph_filter(['아버지가방에들어가신다'])

for i,text in enumerate(post_precessed_context):

print(i,text)

#불완전한 문장이라면 출력이 안된다.

print('================')

post_precessed_context = morph_filter(['아버지가방'])

for i,text in enumerate(post_precessed_context):

print(i,text)

728x90

'파이썬 > 자연어 처리' 카테고리의 다른 글

Rasa 모델 (0)	2023.10.17
Python(Colab) Meta-Llama2 사용해보기 (2)	2023.10.05
Python(Colab) 자연어처리 + 데이터 전처리 개념 + 한국어 특성 (0)	2023.07.11
Python(Colab) 자연어처리 + HuggingFace 사용해보기 (0)	2023.07.11
Python(Colab) 자연어처리 (0)	2023.07.02

영차 영차

영차 영차

태그

최근글

댓글

공지사항

아카이브

반복 횟수가 많은 문자 정규화하기

데이터 후처리 Cleaning 하기

형태소 분석 기반 필터링

'파이썬 > 자연어 처리' 카테고리의 다른 글

관련글

티스토리툴바