웹크롤링을 이용하여 실시간 도서 인기순위 출력

빅데이터/Python

웹크롤링을 이용하여 실시간 도서 인기순위 출력

아임블로거 2023. 4. 9. 22:46

from django.http import HttpResponse
from django.shortcuts import render
from selenium import webdriver

from trend_book_app.views import get_main_trends

try:
    import BeautifulSoup as bs
except:
    from bs4 import BeautifulSoup as bs
from selenium.webdriver.chrome.options import Options

# Create your views here.

def homepage(request):
    """Render the landing page with every book-ranking list.

    The trending list and the Naver and YES24 rankings are fetched in that
    order and handed to the ``book_rank_app/base.html`` template under the
    keys ``books``, ``naver`` and ``yes24``.
    """
    # Keyword arguments are evaluated left to right, so the fetch order
    # is trends, then Naver, then YES24.
    rankings = dict(
        books=get_main_trends(),
        naver=get_naver_rank(),
        yes24=get_yes24_rank(),
    )
    return render(request, 'book_rank_app/base.html', rankings)

def set_headless_driver(url):
    """Start a headless Chrome WebDriver and load *url* in it.

    Args:
        url: Address of the page to open.

    Returns:
        The running WebDriver with the page loaded. The caller owns the
        driver and is responsible for calling ``quit()`` on it.

    Raises:
        Any exception raised while loading the page is propagated after
        the browser has been shut down.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome('chromedriver', options=options)
    try:
        driver.get(url)
    except Exception:
        # The browser process is already running at this point; without
        # this it would be orphaned because the caller never gets a handle.
        driver.quit()
        raise
    return driver

def get_naver_rank():
    """Scrape the first 10 entries of the Naver Series e-book Top 100 page.

    Returns:
        A list of 10 dicts in page order, each with the keys ``rank``,
        ``image_src``, ``title``, ``author`` and ``link``. ``link`` is the
        anchor's ``href`` prefixed with the Naver Series base URL.

    Raises:
        IndexError: If any selector matches fewer than 10 elements.
    """
    naver_url = 'https://series.naver.com/ebook/top100List.series'
    base_url = 'https://series.naver.com'

    driver = set_headless_driver(naver_url)
    try:
        html = driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # headless Chrome process running.
        driver.quit()

    soup = bs(html, 'html.parser')

    # Each of these selectors is independent of the loop index, so run
    # them once and index into the results.
    rank_tags = soup.select('li> span.num > em')
    image_tags = soup.select('ul> li > a > img')
    title_tags = soup.select('ul > li > a > strong')
    author_tags = soup.select('ul > li > a > span.writer')

    naver_list = []
    for i in range(10):
        # The link selector addresses books as a grid of 5 per <ul>;
        # nth-child is 1-based, hence the +1 on both coordinates.
        row, col = divmod(i, 5)
        link_str = f"div#content > div > ul:nth-child({row + 1}) > li:nth-child({col + 1}) > a"

        rank = rank_tags[i].text.strip()
        image_src = image_tags[i]['src']
        title = title_tags[i].text.strip()
        author = author_tags[i].text.strip()
        link = soup.select(link_str)[0]['href']

        naver_list.append({
            'rank': rank,
            'image_src': image_src,
            'title': title,
            'author': author,
            'link': base_url+link,
        })

    return naver_list

def get_yes24_rank():
    """Scrape the first 10 entries of the YES24 e-book main page ranking.

    Returns:
        A list of 10 dicts in page order, each with the keys ``rank``,
        ``image_src``, ``title``, ``author`` and ``link``. ``image_src`` is
        read from the image's ``data-original`` attribute and ``link`` is
        the title anchor's ``href`` prefixed with the YES24 base URL.

    Raises:
        IndexError: If any selector matches fewer than 10 elements.
    """
    # The original literal had a stray leading space before the scheme.
    yes24_url = 'http://www.yes24.com/Mall/Main/EBook/017?CategoryNumber=017'
    base_url = 'http://www.yes24.com'

    driver = set_headless_driver(yes24_url)
    try:
        html = driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # headless Chrome process running.
        driver.quit()

    soup = bs(html, 'html.parser')

    # The selectors do not depend on the loop index, so run each once.
    rank_tags = soup.select('div.item_img > div > span > span > em')
    image_tags = soup.select('div.item_img > div > span > span > a > em.img_bdr > img')
    # The same anchor provides both the title text and the detail link.
    name_tags = soup.select('ul > li > div > div > div.info_row.info_name > a')
    author_tags = soup.select('ul > li > div > div.item_info > div.info_row.info_pubGrp > span.info_auth')

    yes24_list = []
    for i in range(10):
        rank = rank_tags[i].text.strip()
        image_src = image_tags[i]['data-original']
        title = name_tags[i].text.strip()
        author = author_tags[i].text.strip()
        link = name_tags[i]['href']

        yes24_list.append({
            'rank': rank,
            'image_src': image_src,
            'title': title,
            'author': author,
            'link': base_url+link,
        })

    return yes24_list

'빅데이터 > Python' 카테고리의 다른 글

[Python] import matplotlib.pyplot as plt 에러 문제 (0)	2022.12.22
파이선 맛 보기 (0)	2022.12.19

현재글: 웹크롤링을 이용하여 실시간 도서 인기순위 출력

과거에 내가 어떤 생각을 했고 무엇에 관심이 있었는지 미래에 꺼내 보고 싶어서 만든 블로그입니다.

파이썬, 빅데이터교육, 데이터엔지니어, 국비지원, 문제해결, 빅데이터센터, 창원, IT취업, 에러, matplotlib, 빅데이터, 빅데이터분석, 프로그래밍, 경상남도, 데이터엔지니어링, 데이터분석,

Today :
Yesterday :

일	월	화	수	목	금	토
			1	2	3	4
5	6	7	8	9	10	11
12	13	14	15	16	17	18
19	20	21	22	23	24	25
26	27	28	29	30

넘쳐나는 내인생