빅데이터/Python

웹크롤링을 이용하여 실시간 도서 인기순위 출력

아임블로거 2023. 4. 9. 22:46

from django.http import HttpResponse
from django.shortcuts import render
from selenium import webdriver

from trend_book_app.views import get_main_trends

try:
    import BeautifulSoup as bs
except:
    from bs4 import BeautifulSoup as bs
from selenium.webdriver.chrome.options import Options

# Create your views here.

def homepage(request):
    """Render the base template with the three book-ranking lists.

    The context is filled, in this order, from ``get_main_trends()``
    (imported from ``trend_book_app.views``), ``get_naver_rank()`` and
    ``get_yes24_rank()``.

    Args:
        request: The incoming request object handed to ``render``.

    Returns:
        The response produced by ``render`` for
        ``book_rank_app/base.html``.
    """
    # Dict literals evaluate their values top to bottom, so the three
    # data sources are queried in the order listed here.
    return render(
        request,
        'book_rank_app/base.html',
        {
            'books': get_main_trends(),
            'naver': get_naver_rank(),
            'yes24': get_yes24_rank(),
        },
    )


def set_headless_driver(url):
    """Start a headless Chrome session and load ``url`` in it.

    Args:
        url: Address passed to ``driver.get``.

    Returns:
        The ``webdriver.Chrome`` instance after ``driver.get`` has returned.
        The caller owns the driver and should call ``quit()`` on it when
        finished.

    Raises:
        Any exception raised by ``webdriver.Chrome`` or ``driver.get``. If
        the page load fails, the browser is shut down before the error
        propagates.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # Keyword-only construction: the positional driver-path argument is not
    # accepted by Selenium 4.10+, and 'chromedriver' was already the default
    # path in the releases that did accept it.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
    except Exception:
        # Do not leave a browser session behind when navigation fails.
        driver.quit()
        raise
    return driver


def get_naver_rank():
    """Scrape the first ten entries of the Naver Series e-book top-100 page.

    The page is loaded through ``set_headless_driver`` and parsed with
    BeautifulSoup's ``html.parser``.

    Returns:
        A list of dicts, in page order, each with the keys ``rank``,
        ``image_src``, ``title``, ``author`` and ``link`` (the scraped
        ``href`` prefixed with ``https://series.naver.com``). The list has
        at most ten entries, and fewer when any of the field selectors
        matches fewer elements.

    Raises:
        IndexError: If the positional link selector for an entry matches
            nothing.
        KeyError: If a matched image or anchor lacks the expected attribute.
    """
    naver_url = 'https://series.naver.com/ebook/top100List.series'
    base_url = 'https://series.naver.com'
    top_n = 10
    items_per_row = 5

    driver = set_headless_driver(naver_url)
    try:
        html = driver.page_source
    finally:
        # The markup is all that is needed; close the browser even if
        # reading the page source fails so sessions do not pile up.
        driver.quit()

    soup = bs(html, 'html.parser')

    # Run each selector once rather than once per loop iteration.
    ranks = soup.select('li> span.num > em')
    images = soup.select('ul> li > a > img')
    titles = soup.select('ul > li > a > strong')
    authors = soup.select('ul > li > a > span.writer')

    naver_list = []
    # zip stops at the shortest sequence, so a page with fewer than
    # ``top_n`` matches yields a shorter list instead of an IndexError.
    for i, rank, image, title, author in zip(
            range(top_n), ranks, images, titles, authors):
        # The link selector addresses entries positionally: the row number
        # advances after every ``items_per_row`` entries, both 1-based.
        row = i // items_per_row + 1
        col = i % items_per_row + 1
        link_str = f"div#content > div > ul:nth-child({row}) > li:nth-child({col}) > a"
        link = soup.select(link_str)[0]['href']

        naver_list.append({
            'rank': rank.text.strip(),
            'image_src': image['src'],
            'title': title.text.strip(),
            'author': author.text.strip(),
            'link': base_url + link,
        })

    return naver_list


def get_yes24_rank():
    """Scrape the first ten entries of the YES24 e-book main page.

    The page is loaded through ``set_headless_driver`` and parsed with
    BeautifulSoup's ``html.parser``.

    Returns:
        A list of dicts, in page order, each with the keys ``rank``,
        ``image_src`` (taken from the image's ``data-original`` attribute),
        ``title``, ``author`` and ``link`` (the scraped ``href`` prefixed
        with ``http://www.yes24.com``). The list has at most ten entries,
        and fewer when any of the field selectors matches fewer elements.

    Raises:
        KeyError: If a matched image or anchor lacks the expected attribute.
    """
    # The original literal began with a stray space; the URL is now clean.
    yes24_url = 'http://www.yes24.com/Mall/Main/EBook/017?CategoryNumber=017'
    base_url = 'http://www.yes24.com'
    top_n = 10

    driver = set_headless_driver(yes24_url)
    try:
        html = driver.page_source
    finally:
        # The markup is all that is needed; close the browser even if
        # reading the page source fails so sessions do not pile up.
        driver.quit()

    soup = bs(html, 'html.parser')

    # Run each selector once rather than once per loop iteration. The
    # title anchor supplies both the title text and the link target.
    ranks = soup.select('div.item_img > div > span > span > em')
    images = soup.select('div.item_img > div > span > span > a > em.img_bdr > img')
    title_anchors = soup.select('ul > li > div > div > div.info_row.info_name > a')
    authors = soup.select('ul > li > div > div.item_info > div.info_row.info_pubGrp > span.info_auth')

    yes24_list = []
    # zip stops at the shortest sequence, so a page with fewer than
    # ``top_n`` matches yields a shorter list instead of an IndexError.
    for _, rank, image, anchor, author in zip(
            range(top_n), ranks, images, title_anchors, authors):
        yes24_list.append({
            'rank': rank.text.strip(),
            'image_src': image['data-original'],
            'title': anchor.text.strip(),
            'author': author.text.strip(),
            'link': base_url + anchor['href'],
        })

    return yes24_list

'빅데이터 > Python' 카테고리의 다른 글

[Python] import matplotlib.pyplot as plt 에러 문제  (0) 2022.12.22
파이선 맛 보기  (0) 2022.12.19