본문 바로가기
프로젝트

퍼펫티어를 이용한 크롤링(1)

by 해룸 2024. 4. 11.

퍼펫티어 npm 설치

npm i puppeteer

 

페이지 띄우는 코드

 // Crawl target; `ridibooks` is defined outside this snippet (presumably a shared config object — confirm).
 const url = ridibooks.ridiPage;
      // Launch a headless browser, then open a new page (tab) in it.
      const browser = await puppeteer.launch({ headless: true });
      const page = await browser.newPage();

 

로그인, 진짜 별거 없다.. 실제로 로그인 페이지를 띄워서 로그인 정보를 입력한 후 로그인 버튼을 클릭한다.

 /**
  * Logs in through the site's login form using the credentials stored in the
  * `RIDI_ID` / `RIDI_PW` environment variables.
  *
  * @param page - Puppeteer page used to drive the browser.
  * @param url - Page to open first; the login is considered successful when
  *   the browser ends up on exactly this URL after submitting the form.
  * @returns `true` when the page URL equals `url` after login, otherwise `false`.
  * @throws Error when either credential environment variable is missing.
  */
 async login(page: any, url: string): Promise<boolean> {
    const { pageBtn, submitBtn, textCursor1, textCursor2 } = ridibooks.login;

    // Fail early with a clear message instead of passing `undefined` to keyboard.type.
    const id = process.env.RIDI_ID;
    const pw = process.env.RIDI_PW;
    if (!id || !pw) {
      throw new Error('RIDI_ID and RIDI_PW environment variables must be set');
    }

    await page.goto(url);
    await page.waitForSelector(pageBtn);
    await page.click(pageBtn);
    await page.waitForSelector(submitBtn);

    // Focus each input by clicking it, then type with a per-key delay.
    await page.click(textCursor1);
    await page.keyboard.type(id, { delay: 100 });
    await page.click(textCursor2);
    await page.keyboard.type(pw, { delay: 100 });

    // Start waiting for the navigation BEFORE clicking, so a navigation that
    // completes quickly after the click cannot be missed.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'networkidle0' }),
      page.click(submitBtn),
    ]);

    return page.url() === url;
  }

 

목록에서 상세페이지 링크를 배열로 만들기

  /**
   * Walks the daily best-seller list pages and collects, for every list item,
   * the detail-page link and its rank.
   *
   * @param page - Puppeteer page used for navigation and DOM evaluation.
   * @param type - `'webNovels'` crawls the `romance_serial` list; any other
   *   value crawls the `webtoon` list.
   * @param url - Base URL that `/bestsellers/...` is appended to.
   * @param maxPages - Number of list pages to visit, starting from page 1.
   * @returns Array of `{ link, rank }` objects; `rank` is `null` for items
   *   whose rank is greater than 20.
   */
  async getLinkList(page: any, type: string, url: string, maxPages: number) {
    const linkList: any[] = [];
    const category = type === 'webNovels' ? 'romance_serial' : 'webtoon';

    for (let currentPage = 1; currentPage <= maxPages; currentPage += 1) {
      const newUrl = `${url}/bestsellers/${category}?page=${currentPage}&order=daily`;
      console.log(`Start Crawling: ${newUrl}`);
      await page.goto(newUrl, { waitUntil: 'networkidle0' });

      // Scroll to the bottom first so items that only load on scroll are in the DOM.
      await this.scrolling(page);

      // The extraction is the same for both content types; only the URL differs.
      // This callback runs inside the browser page.
      const newLinks = await page.evaluate(() => {
        const items = Array.from(
          document.querySelectorAll(
            '#__next > main > section > ul.fig-1w8zspb > li',
          ),
        );

        return items.map((item) => {
          const link = item.querySelector('a')?.getAttribute('href');
          const rankText = item.querySelector('div > div.fig-9njjsy')?.innerHTML;
          // Ranks above 20 are blanked out; a missing or non-numeric rank is kept as-is.
          const rank = Number(rankText) > 20 ? null : rankText;

          return { link, rank };
        });
      });

      linkList.push(...newLinks);
    }
    return linkList;
  }

 

상세페이지에 들어가서 본격적인 크롤링

 /**
  * Scrapes the metadata of the detail page shown in `page` and, when the work
  * has a non-zero rating count, its reviews.
  *
  * @param rank - Rank taken from the list page; copied into the result unchanged.
  * @param page - Puppeteer page. NOTE(review): the initial `waitForNavigation`
  *   presumably relies on the caller having just started a navigation on this
  *   page — confirm at the call site.
  * @param type - `'webContents'` derives the adult flag from the scraped
  *   genres; any other value checks for a `.badge_adult` element on the page.
  * @returns `{ contentTitle, webContents }` when every rating count is zero or
  *   missing, `{ contentTitle, webContents, reviews }` otherwise, or
  *   `undefined` when scraping failed (the error is logged, not rethrown).
  */
 async scrapPostAndReview(rank: any, page: any, type: string) {
    try {
      await page.waitForNavigation({ waitUntil: 'networkidle2' });
      await page.waitForSelector(
        'div.header_thumbnail_wrap > div.header_thumbnail.book_macro_200.detail_scalable_thumbnail > div > div > div > img',
      );

      const url = page.url();
      // This callback is executed inside the browser page, so it only uses DOM
      // APIs and values defined within the callback itself.
      const webContent = await page.evaluate(() => {
        const items = Array.from(
          document.querySelectorAll('#page_detail > div > div > section'),
        );

        // The lists below are looked up on the whole document (not per item),
        // so they are computed once and shared by every returned entry.
        const genre = [
          // Set removes duplicated genre labels while keeping first-seen order.
          ...new Set(
            Array.from(
              document.querySelectorAll(
                '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_header.trackable > div.header_info_wrap > p > a',
              ),
            ).map((anchor) => anchor?.textContent),
          ),
        ];
        const keyword = Array.from(
          document.querySelectorAll(
            '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_box_module.detail_keyword.js_detail_keyword_module > ul > li',
          ),
        ).map((li) => li.querySelector('button > span')?.textContent);
        const author = Array.from(
          document.querySelectorAll(
            '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_header.trackable > div.header_info_wrap > div:nth-child(4) > p.metadata.metadata_writer > span',
          ),
          // Whitespace runs inside an author entry are collapsed to '/'.
        ).map((span) => (span?.textContent ?? '').replace(/\s+/g, '/'));
        const dsc = Array.from(
          document.querySelectorAll('#introduce_book'),
        ).map((block) => block.querySelector('p')?.textContent);

        return items.map((item) => {
          return {
            // isAdult: item.querySelector(
            //   '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_header.trackable > div.header_thumbnail_wrap > div.header_thumbnail.book_macro_200.detail_scalable_thumbnail > div > div > span',
            // )
            //   ? true
            //   : false,
            title: item.querySelector(
              '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_header.trackable > div.header_info_wrap > div.info_title_wrap > h1',
            )?.textContent,
            img: item
              .querySelector(
                'div.header_thumbnail_wrap > div.header_thumbnail.book_macro_200.detail_scalable_thumbnail > div > div > div > img',
              )
              ?.getAttribute('src'),
            // score: item.querySelector(
            //   '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_header.trackable > div.header_info_wrap > div:nth-child(3) > p > span > span.StarRate_Score',
            // )?.textContent,
            // Optional chaining: a page without a rating element must not abort the scrape.
            scoreParticipant: item.querySelector(
              '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_header.trackable > div.header_info_wrap > div:nth-child(3) > p > span > span.StarRate_ParticipantCount',
            )?.textContent,
            // userLikes: item.querySelector(
            //   '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_header.trackable > div.header_thumbnail_wrap > div.header_preference > button > span > span.button_text.js_preference_count',
            // )?.textContent,
            // status: item.querySelector(
            //   '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_header.trackable > div.header_info_wrap > div:nth-child(4) > p.metadata.metadata_info_series_complete_wrap > span.metadata_item.not_complete',
            // )?.textContent,
            pubDate: item
              .querySelector(
                '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_header.trackable > div.Header_Metadata_Block > ul:nth-child(2) > li.Header_Metadata_Item.book_info.published_date_info > ul > li',
              )
              ?.textContent?.trim(),
            genre,
            keyword,
            author,
            publish: item.querySelector(
              '#page_detail > div.detail_wrap > div.detail_body_wrap > section > article.detail_header.trackable > div.header_info_wrap > div:nth-child(4) > p.metadata.file_info.publisher_info > a',
            )?.textContent,
            dsc,
          };
        });
      });

      let isAdult = false;
      if (type === 'webContents') {
        isAdult = webContent.some((content) => content.genre.includes('성인'));
      } else {
        // page.$ resolves to null when nothing matches, whereas page.$eval
        // throws — which previously made every non-adult item fail.
        isAdult = (await page.$('.badge_adult')) !== null;
      }

      const scrapDate = new Date().toLocaleDateString();

      const webContents = {
        url,
        rank,
        isAdult,
        webContent,
        scrapDate,
      };

      const contentTitle = webContent.map((content) => content.title);
      const reviewCounts = webContent.map(
        (content) => content.scoreParticipant,
      );

      // Skip review scraping when every rating count is zero or missing.
      const hasNoReviews = reviewCounts.every(
        (count) => Number(count ?? 0) === 0,
      );
      if (hasNoReviews) {
        return { contentTitle, webContents };
      }

      await page.bringToFront();

      const reviews = await this.scrapReviews(contentTitle, page);

      return {
        contentTitle,
        webContents,
        reviews,
      };
    } catch (err) {
      console.log('❌❌❌❌', page.url(), err);
      // Best effort: a failed page yields `undefined` so the caller can continue.
      return undefined;
    }
  }

 

스크롤 함수

무한 스크롤이 되는 페이지의 경우에는 사용자에게 빠른 응답을 주기 위해 모든 데이터를 한 번에 갖고 있는 게 아니라,

스크롤을 하며 유저가 해당 위치에 도달했을 때 데이터를 불러와 보여준다. 그래서 크롤링 전에 페이지 끝까지 스크롤해 주는 코드가 필요하다!

/**
 * Scrolls the page downwards in fixed steps until the distance scrolled so far
 * reaches the document's current scroll height.
 *
 * @param page - Puppeteer page whose document should be scrolled.
 */
async scrolling(page: any) {
    await page.evaluate(async () => {
      // Everything below runs inside the browser page.
      const stepPx = 250;
      const intervalMs = 100;

      await new Promise((done) => {
        let scrolled = 0;

        const ticker = setInterval(() => {
          // Read the height on every tick: it can grow as more content loads.
          const pageHeight = document.documentElement.scrollHeight;
          window.scrollBy(0, stepPx);
          scrolled += stepPx;

          if (scrolled < pageHeight) {
            return;
          }
          clearInterval(ticker);
          done(true);
        }, intervalMs);
      });
    });
  }

 

이건.. 댓글을 자꾸 중복으로 가져와서 추가로 작성한 코드인데, 어찌어찌 또 해결이 돼서 지금은 사용하지 않는 코드이다.

// while (reviewList.length < 30) {
    //   await page.waitForSelector(
    //     '#review_list_section > div.review_list_wrapper.js_review_list_wrapper.active > ul > li',
    //     { waitUntil: 'networkidle0' },
    //   );

    //   await page.evaluate(async () => {
    //     await new Promise((resolve, reject) => {
    //       let totalHeight = 0;
    //       const distance = 100;
    //       const scrollStep = () => {
    //         window.scrollBy(0, distance);
    //         totalHeight += distance;
    //         const button = document.querySelector(
    //           '#review_list_section > div.review_list_wrapper.js_review_list_wrapper.active > ul > li',
    //         );
    //         if (button) {
    //           resolve(true);
    //           return;
    //         }
    //         if (!button) {
    //           resolve(true);
    //           return;
    //         }
    //         setTimeout(scrollStep, 3000);
    //       };
    //       scrollStep();
    //     });
    //   });

    //   if (!(await page.$(ridibooks.moreReviewBtn))) {
    //     break;
    //   }

    //   await page.waitForSelector(ridibooks.moreReviewBtn);

    //   await page.click(ridibooks.moreReviewBtn);

    //   reviewList = await page.$$(
    //     '#review_list_section > div.review_list_wrapper.js_review_list_wrapper.active > ul > li',
    //   );

    //   if (!(await page.$(ridibooks.moreReviewBtn))) {
    //     break;
    //   }

    //   console.log(reviewList.length);
    // }

 

크롤링한 데이터를 json 파일로 저장

  /**
   * Writes each scraped post, and its reviews when present, to JSON files.
   *
   * Posts go to `webNovels/` when `type` is `'webNovels'`, otherwise to
   * `posts/`; reviews always go to `reviews/`. The file name is built from the
   * numeric id that follows `cover/` in the post's image URL(s).
   * NOTE(review): the target directories are assumed to exist already.
   *
   * @param type - Content type used to pick the output directory.
   * @param posts - Array of scrape results (`{ webContents, reviews? }`).
   *   Entries that are empty or have no usable cover id are skipped.
   * @returns Nothing; errors are logged per post and never rethrown.
   */
  async createJsonFile(type: string, posts: any) {
    const result = type === 'webNovels' ? 'webNovels' : 'posts';
    const coverIdPattern = /(?<=cover\/)\d+/;

    if (!Array.isArray(posts)) {
      console.error(new TypeError('createJsonFile: posts must be an array'));
      return;
    }

    for (const post of posts) {
      // Handle failures per post so one bad entry does not stop the rest.
      try {
        // A failed scrape can leave an empty slot in the list; skip it.
        if (!post?.webContents) {
          continue;
        }

        const names: string[] = [];
        for (const item of post.webContents.webContent ?? []) {
          const matchResult =
            typeof item?.img === 'string'
              ? item.img.match(coverIdPattern)
              : null;
          if (matchResult) {
            names.push(`[ridibooks] ${matchResult[0]}`);
          }
        }

        if (names.length === 0) {
          // Without an id the file would be written as ".json"; skip instead.
          console.error(
            'createJsonFile: no cover id found, skipping',
            post.webContents.url,
          );
          continue;
        }

        // Joined with ',' to match how the names were previously stringified.
        const fileName = names.join(',');

        await fs.writeFile(
          `${result}/${fileName}.json`,
          JSON.stringify(post.webContents),
        );

        if (post.reviews) {
          await fs.writeFile(
            `reviews/${fileName}.json`,
            JSON.stringify(post.reviews),
          );
        }
      } catch (err) {
        console.error(err);
      }
    }
  }

 

'프로젝트' 카테고리의 다른 글

api를 이용한 스크래핑 성능 개선  (0) 2024.04.15
퍼펫티어를 이용한 크롤링(2)  (0) 2024.04.11
최종프로젝트  (0) 2024.04.09
db 선택  (0) 2024.03.27
To-DuBu: KPT 회고록  (0) 2024.03.25