[DEV] 3주차. 파이썬으로 웹 다루기(5)

1. 기상청 날씨 스크래핑

https://www.weather.go.kr/w/weather/forecast/short-term.do

기온 정보 스크래핑

from selenium import webdriver
from selenium.webdriver import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.actions.action_builder import ActionBuilder
from selenium.webdriver import Keys, ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

import time

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

driver.get("https://www.weather.go.kr/w/weather/forecast/short-term.do")
time.sleep(5)

temps = driver.find_element(By.ID, "my-tchart").text
temps = [int(i) for i in temps.replace('℃', '').split("\n")]
print(temps)

스크린샷 2023-10-27 오후 5 20 07

파란 부분을 스크래핑 한 결과

[15, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 10, 13, 14, 16]

오류

time.sleep() 을 하지 않으면 오류가 발생했음!
오류가 발생한다면 wait 타임을 가져볼 것

Line Plot으로 기온 추이 나타내기

import matplotlib.pyplot as plt

plt.ylim(min(temps)-2, max(temps)+2)

sns.lineplot(
    x = [i for i in range(len(temps))],
    y = temps
)

plt.show()

스크린샷 2023-10-27 오후 5 22 15

x의 범위에 비해 y의 범위가 좁기 때문에 더 정확한 그래프를 그리기 위해서 plt.ylim() 으로 y의 범위를 조절함

2. 해시코드 질문태그 빈도 시각화

https://hashcode.co.kr

스크린샷 2023-10-27 오후 5 23 58

이 태그들을 모아 빈도수를 시각화 할 것
- 1) <ul> 태그 모두 찾기
- 2) 1번 안에 있는 <li> 태그의 text 추출

요청

# User-Agent 추가

user_agent = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

# 요청 진행

import requests
from bs4 import BeautifulSoup

res = requests.get("https://hashcode.co.kr")
soup = BeautifulSoup(res.text, "html.parser")

ul_tags = soup.find_all("ul", "question-tags")
for ul in ul_tags:
    li_tags = ul.find_all("li")
    for li in li_tags:
        print(li.text.strip())

출력결과

python
class
python
python
mssql
migration
c#
c++
visualstudio
c++
deploy
nginx
stack
c++
token
iframe
react
javascript
css
border
html
selector
div
vscode
flutter
android
code
xcode
python
heapsort
c
python
operator
python
django
serializable
java
node.js
mysql
javascript
c
c++
python
turtle
c
sorting
mongodb
directx
pandas
dataframe
excel
c
scanf
ipc
openai
….

dictionary에 저장

import time

freq = {}

for i in range(1, 11):
    res = requests.get("https://hashcode.co.kr")
    soup = BeautifulSoup(res.text, "html.parser")

    ul_tags = soup.find_all("ul", "question-tags")
    for ul in ul_tags:
        li_tags = ul.find_all("li")
        for li in li_tags:
            tag = li.text.strip()
            if tag not in freq:
                freq[tag] = 1
            else:
                freq[tag] += 1
    time.sleep(1)
freq

출력결과

{‘python’: 120,
‘class’: 20,
‘mssql’: 10,
‘migration’: 10,
‘c#’: 30,
‘c++’: 50,
‘visualstudio’: 10,
‘deploy’: 10,
‘nginx’: 10,
‘stack’: 10,
‘token’: 10,
‘iframe’: 10,
‘react’: 10,
‘javascript’: 20,
‘css’: 20,
‘border’: 10,
‘html’: 20,
‘selector’: 10,
‘div’: 10,
‘vscode’: 10,
‘flutter’: 10,
‘android’: 10,
‘code’: 10,
‘xcode’: 10,
‘heapsort’: 10,
‘c’: 60,
‘operator’: 10,
‘django’: 10,
‘serializable’: 10,
‘java’: 10,
‘node.js’: 10,
‘mysql’: 10,
‘turtle’: 10,
‘sorting’: 10,
‘mongodb’: 10,
‘directx’: 10,
‘pandas’: 10,
‘dataframe’: 10,
‘excel’: 10,
‘scanf’: 10,
‘ipc’: 10,
‘openai’: 10,
‘cv2’: 10,
‘gui’: 10,
‘windows’: 20,
‘opencv’: 10,
‘interface’: 10,
‘inheritance’: 10,
‘back-end’: 10,
‘front-end’: 10,
‘spring-boot’: 10,
‘web’: 10,
‘security’: 10,
‘firewall’: 10,
‘selenium’: 10,
‘beautifulsoup’: 10,
‘urllib’: 10,
‘unity’: 10,
‘scraping’: 10,
‘requests’: 10,
‘spring’: 10,
‘floatig-point’: 10,
‘editor’: 10,
‘mfc’: 10,
‘pointer’: 10,
‘array’: 10,
‘visual-studio-2010’: 10}

Counter로 가장 빈도가 높은 value들 추출

from collections import Counter

counter = Counter(freq)
counter.most_common(10)

출력결과

[(‘python’, 120),
(‘c’, 60),
(‘c++’, 50),
(‘c#’, 30),
(‘class’, 20),
(‘javascript’, 20),
(‘css’, 20),
(‘html’, 20),
(‘windows’, 20),
(‘mssql’, 10)]

BarPlot 그리기

import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 10))
plt.title("Frequency of quesion in Hashcode")
plt.xlabel("Tag")
plt.ylabel("Frequency")

x = [elem[0] for elem in counter.most_common(10)]
y = [elem[1] for elem in counter.most_common(10)]

sns.barplot(x=x, y=y)
plt.show()

스크린샷 2023-10-27 오후 5 33 44

3. Wordcloud

wordcloud 만드는 방법

1) KoNLPy 라이브러리로 한국어 문장 전처리
2) Counter 이용해 빈도수 측정
3) WordCloud 이용해 시각화

준비

# 시각화에 쓰이는 라이브러리
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# 횟수를 기반으로 딕셔너리 생성
from collections import Counter

# 문장에서 명사를 추출하는 형태소 분석 라이브러리
from konlpy.tag import Hannanum

# 워드클라우드에 사용할 애국가 가사
national_anthem = """
동해물과 백두산이 마르고 닳도록
하느님이 보우하사 우리나라 만세
무궁화 삼천리 화려 강산
대한 사람 대한으로 길이 보전하세
남산 위에 저 소나무 철갑을 두른 듯
바람 서리 불변함은 우리 기상일세
무궁화 삼천리 화려 강산
대한 사람 대한으로 길이 보전하세
가을 하늘 공활한데 높고 구름 없이
밝은 달은 우리 가슴 일편단심일세
무궁화 삼천리 화려 강산
대한 사람 대한으로 길이 보전하세
이 기상과 이 맘으로 충성을 다하여
괴로우나 즐거우나 나라 사랑하세
무궁화 삼천리 화려 강산
대한 사람 대한으로 길이 보전하세
"""

명사 추출

# Hannanum 객체를 생성한 후, .nouns()를 통해 명사를 추출합니다.

hannanum = Hannanum()
nouns = hannanum.nouns(national_anthem)
words = [noun for noun in nouns if len(noun) > 1]

words[:10]

[‘동해물’, ‘백두산’, ‘하느님’, ‘보우하사’, ‘우리나라’, ‘무궁화’, ‘삼천리’, ‘화’, ‘강산’, ‘사람’]

명사 개수 세어 시각화

counter = Counter(words)

# 한글 font_path 설정
wordcloud = WordCloud(
    font_path="/Users/bokyung/Library/Fonts/malgun.ttf",
    background_color="white",
    width=1000,
    height=1000
)
img = wordcloud.generate_from_frequencies(counter)

plt.imshow(img)

스크린샷 2023-10-27 오후 5 53 54

4. Hashcode 질문 키워드 wordcloud

질문 제목 모아오기

user_agent = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

import time
import requests
from bs4 import BeautifulSoup

questions = []

for i in range(1, 6):
    res = requests.get("https://hashcode.co.kr/?pages={}".format(i), {"User-Agent": user_agent})
    soup = BeautifulSoup(res.text, "html.parser")
    
    parsed_datas = soup.find_all("li", "question-list-item")
    for data in parsed_datas:
        questions.append(data.find("div", "question").find("div", "top").h4.text)
    
    time.sleep(0.5)

명사 추출

# 시각화에 쓰이는 라이브러리
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# 횟수를 기반으로 딕셔너리 생성
from collections import Counter

# 문장에서 명사를 추출하는 형태소 분석 라이브러리
from konlpy.tag import Hannanum

hannanum = Hannanum()

words = []

for question in questions:
    nouns = hannanum.nouns(question)
    words += nouns

print(len(words))

1096

단어 개수 세고 시각화

counter = Counter(words)

counter : dictionary 형태로 단어와 빈도수 저장

wordcloud = WordCloud(
    font_path="/Users/bokyung/Library/Fonts/malgun.ttf",
    background_color="white",
    height=1000,
    width=1000,
)

img = wordcloud.generate_from_frequencies(counter)

plt.imshow(img)

스크린샷 2023-10-27 오후 6 11 02