Gutenberg project_Alice's Adventures in Wonderland

Data Analysis/텍스트마이닝

Gutenberg project_Alice's Adventures in Wonderland

뉴욕킴 2023. 11. 1. 23:11

1. Gutenberg project 접속

2. 텍스트 마이닝 하고 싶은 E-BOOK 고르기

3. 코드 짜보기

pip install wordcloud

import pandas as pd
import numpy as np
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image

import requests
from bs4 import BeautifulSoup
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Extract the plain text of the e-book from the Project Gutenberg HTML page.
url = "https://www.gutenberg.org/cache/epub/11/pg11-images.html"
# A timeout keeps the script from hanging indefinitely if the server stalls.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of tokenizing an error page.
response.raise_for_status()
# Hand BeautifulSoup the raw bytes so it detects the encoding from the
# document itself rather than from requests' header-based guess.
soup = BeautifulSoup(response.content, 'html.parser')
text = soup.get_text()

# Visualize the English text as a word cloud of word frequencies
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Text preprocessing: tokenize, lowercase, drop non-words and stop words.
nltk.download('punkt')  # tokenizer models used by older NLTK releases
# Newer NLTK releases load the word_tokenize models from 'punkt_tab';
# without it word_tokenize raises LookupError on a fresh install.
nltk.download('punkt_tab')
nltk.download('stopwords')  # stop-word lists, including English
stop_words = set(stopwords.words('english'))
words = nltk.word_tokenize(text.lower())
# Keep only purely alphabetic tokens that are not stop words.
words = [word for word in words if word.isalpha() and word not in stop_words]

# Count how often each remaining word occurs.
word_counts = nltk.FreqDist(words)

# Configure the word cloud first, then fill it from the frequency table.
wordcloud = WordCloud(
    background_color="white",
    height=400,
    width=800,
)
wordcloud.generate_from_frequencies(word_counts)

# Render the word cloud on a 10x5 inch figure with the axes hidden.
_, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()