퍼펫티어 npm 설치
npm i puppeteer
페이지 띄우는 코드
// Start a headless browser instance and open a fresh tab in it.
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
// Start URL, read from the `ridibooks` settings object (defined elsewhere).
const url = ridibooks.ridiPage;
로그인, 진짜 별거 없다.. 실제로 로그인 페이지를 띄워서 로그인 정보를 입력 후 클릭한다.
/**
 * Logs in through the site's login form with credentials from the environment.
 *
 * Opens `url`, clicks the button that leads to the login form, types
 * `RIDI_ID` / `RIDI_PW` into the two inputs and submits.
 *
 * @param page - Puppeteer page used to drive the browser.
 * @param url - Page opened first; login counts as successful when the browser
 *   ends up on exactly this URL again after submitting.
 * @returns `true` when the page URL equals `url` after the post-submit
 *   navigation, otherwise `false`.
 * @throws Error when `RIDI_ID` or `RIDI_PW` is not set.
 */
async login(page: any, url: string): Promise<boolean> {
  const { pageBtn, submitBtn, textCursor1, textCursor2 } = ridibooks.login;

  // Fail early with a readable message instead of a TypeError from keyboard.type.
  const id = process.env.RIDI_ID;
  const password = process.env.RIDI_PW;
  if (!id || !password) {
    throw new Error('RIDI_ID and RIDI_PW must be set');
  }

  await page.goto(url);
  await page.waitForSelector(pageBtn);
  await page.click(pageBtn);

  // The form is considered ready once its submit button exists.
  await page.waitForSelector(submitBtn);
  await page.click(textCursor1);
  await page.keyboard.type(id, { delay: 100 });
  await page.click(textCursor2);
  await page.keyboard.type(password, { delay: 100 });

  // Arm the navigation wait before clicking; waiting afterwards can miss a
  // navigation that already finished and then hang until the timeout.
  await Promise.all([
    page.waitForNavigation({ waitUntil: 'networkidle0' }),
    page.click(submitBtn),
  ]);

  return page.url() === url;
}
목록에서 상세페이지 링크를 배열로 만들기
/**
 * Collects detail-page links and their ranks from the daily bestseller list.
 *
 * Visits pages 1..`maxPages` of the listing, scrolls each page to the bottom
 * and reads every list item.
 *
 * @param page - Puppeteer page used to drive the browser.
 * @param type - `'webNovels'` selects the `romance_serial` listing; any other
 *   value selects `webtoon`.
 * @param url - Base URL the listing path is appended to.
 * @param maxPages - Number of listing pages to visit (1-based, inclusive).
 * @returns One `{ link, rank }` entry per list item, in page order. `link` is
 *   the anchor's `href` attribute; `rank` is the rank element's HTML, or
 *   `null` when it parses to a number greater than 20.
 */
async getLinkList(page: any, type: string, url: string, maxPages: number) {
  const category = type === 'webNovels' ? 'romance_serial' : 'webtoon';
  const linkList: any[] = [];

  for (let currentPage = 1; currentPage <= maxPages; currentPage += 1) {
    const newUrl = `${url}/bestsellers/${category}?page=${currentPage}&order=daily`;
    console.log(`Start Crawling: ${newUrl}`);
    await page.goto(newUrl, { waitUntil: 'networkidle0' });
    await this.scrolling(page);

    // This callback is serialized and executed in the browser, so it must stay
    // self-contained and cannot reference variables from this method.
    const newLinks = await page.evaluate(() => {
      const items = Array.from(
        document.querySelectorAll(
          '#__next > main > section > ul.fig-1w8zspb > li',
        ),
      );
      return items.map((item) => {
        const link = item.querySelector('a')?.getAttribute('href');
        const rankText = item.querySelector('div > div.fig-9njjsy')?.innerHTML;
        // A missing rank element gives NaN here, which is not > 20, so the
        // value is passed through unchanged.
        const rank = Number(rankText) > 20 ? null : rankText;
        return { link, rank };
      });
    });

    linkList.push(...newLinks);
  }

  return linkList;
}
상세페이지에 들어가서 본격적인 크롤링
/**
 * Scrapes the detail page currently loading in `page` and, when the page
 * reports rating participants, its reviews as well.
 *
 * @param rank - Rank of this item from the listing; copied into the result.
 * @param page - Puppeteer page that is navigating to the detail page.
 * @param type - For `'webContents'` the adult flag is derived from the genre
 *   list; for any other value it is derived from the `.badge_adult` element.
 * @returns `{ contentTitle, webContents }`, plus `reviews` when there are
 *   rating participants. Returns `undefined` when scraping fails; the error
 *   is logged rather than rethrown.
 */
async scrapPostAndReview(rank: any, page: any, type: string) {
  const thumbnailSelector =
    'div.header_thumbnail_wrap > div.header_thumbnail.book_macro_200.detail_scalable_thumbnail > div > div > div > img';

  try {
    // NOTE(review): this only resolves if a navigation is still in flight when
    // we get here, so it relies on how the caller triggers it — confirm.
    await page.waitForNavigation({ waitUntil: 'networkidle2' });
    await page.waitForSelector(thumbnailSelector);
    const url = page.url();

    // The callback is serialized and run in the browser: it must be
    // self-contained, so the thumbnail selector is passed in as an argument.
    const webContent: any[] = await page.evaluate((imgSelector: string) => {
      const section =
        '#page_detail > div.detail_wrap > div.detail_body_wrap > section';
      const header = `${section} > article.detail_header.trackable`;
      const info = `${header} > div.header_info_wrap`;
      const meta = `${info} > div:nth-child(4)`;

      const selectAll = (selector: string) =>
        Array.from(document.querySelectorAll(selector));

      // These lists are looked up on the whole document, so they are the same
      // for every section and are computed once.
      const genre = Array.from(
        new Set(selectAll(`${info} > p > a`).map((el) => el.textContent)),
      );
      const keyword = selectAll(
        `${section} > article.detail_box_module.detail_keyword.js_detail_keyword_module > ul > li`,
      ).map((el) => el.querySelector('button > span')?.textContent);
      const author = selectAll(
        `${meta} > p.metadata.metadata_writer > span`,
      ).map((el) => (el.textContent ?? '').replace(/\s+/g, '/'));
      const dsc = selectAll('#introduce_book').map(
        (el) => el.querySelector('p')?.textContent,
      );

      // Fields that used to be scraped but are disabled in the original code:
      // the adult badge, score, userLikes and status.
      return selectAll('#page_detail > div > div > section').map((item) => ({
        title: item.querySelector(`${info} > div.info_title_wrap > h1`)
          ?.textContent,
        img: item.querySelector(imgSelector)?.getAttribute('src'),
        // Optional chaining: the element may be absent, and a missing value
        // must not abort the whole scrape.
        scoreParticipant: item.querySelector(
          `${info} > div:nth-child(3) > p > span > span.StarRate_ParticipantCount`,
        )?.textContent,
        pubDate: item
          .querySelector(
            `${header} > div.Header_Metadata_Block > ul:nth-child(2) > li.Header_Metadata_Item.book_info.published_date_info > ul > li`,
          )
          ?.textContent?.trim(),
        genre,
        keyword,
        author,
        publish: item.querySelector(
          `${meta} > p.metadata.file_info.publisher_info > a`,
        )?.textContent,
        dsc,
      }));
    }, thumbnailSelector);

    let isAdult: boolean;
    if (type === 'webContents') {
      isAdult = webContent.some((content) => content.genre.includes('성인'));
    } else {
      // page.$ yields null when nothing matches; page.$eval would throw and
      // send every page without the badge into the catch block.
      isAdult = (await page.$('.badge_adult')) !== null;
    }

    // Locale-dependent date string of the scrape time.
    const scrapDate = new Date().toLocaleDateString();
    const webContents = {
      url,
      rank,
      isAdult,
      webContent,
      scrapDate,
    };

    const contentTitle = webContent.map((content) => content.title);
    const participantCounts = webContent.map(
      (content) => content.scoreParticipant,
    );
    // Same coercion as the original `+array == 0`: the joined text parses to
    // 0 for an empty/missing count or "0", in which case reviews are skipped.
    if (Number(participantCounts.join(',')) === 0) {
      return { contentTitle, webContents };
    }

    await page.bringToFront();
    const reviews = await this.scrapReviews(contentTitle, page);
    return {
      contentTitle,
      webContents,
      reviews,
    };
  } catch (err) {
    // Best effort: log the failing URL and let the caller continue.
    console.log('❌❌❌❌', page.url(), err);
  }
}
스크롤 함수
무한 스크롤 페이지는 사용자에게 빠른 응답을 주기 위해 모든 데이터를 한 번에 가지고 있지 않고,
사용자가 스크롤해서 해당 위치에 도달했을 때 데이터를 불러와 보여준다. 그래서 페이지 끝까지 스크롤해 주는 코드!
/**
 * Scrolls the page down in 250px steps every 100ms until the distance
 * scrolled reaches the document's current scroll height.
 *
 * @param page - Puppeteer page to scroll.
 */
async scrolling(page: any) {
  await page.evaluate(async () => {
    // Runs in the browser; resolves once the bottom has been reached.
    await new Promise((done) => {
      const step = 250;
      let scrolled = 0;

      const ticker = setInterval(() => {
        // Re-read on every tick: the height may grow as more content loads.
        const fullHeight = document.documentElement.scrollHeight;
        window.scrollBy(0, step);
        scrolled += step;

        if (scrolled < fullHeight) {
          return;
        }
        clearInterval(ticker);
        done(true);
      }, 100);
    });
  });
}
이건.. 댓글을 자꾸 중복으로 가져와서 추가로 작성한 코드인데, 어찌어찌 또 해결이 돼서 지금은 사용하지 않는 코드이다.
// while (reviewList.length < 30) {
// await page.waitForSelector(
// '#review_list_section > div.review_list_wrapper.js_review_list_wrapper.active > ul > li',
// { waitUntil: 'networkidle0' },
// );
// await page.evaluate(async () => {
// await new Promise((resolve, reject) => {
// let totalHeight = 0;
// const distance = 100;
// const scrollStep = () => {
// window.scrollBy(0, distance);
// totalHeight += distance;
// const button = document.querySelector(
// '#review_list_section > div.review_list_wrapper.js_review_list_wrapper.active > ul > li',
// );
// if (button) {
// resolve(true);
// return;
// }
// if (!button) {
// resolve(true);
// return;
// }
// setTimeout(scrollStep, 3000);
// };
// scrollStep();
// });
// });
// if (!(await page.$(ridibooks.moreReviewBtn))) {
// break;
// }
// await page.waitForSelector(ridibooks.moreReviewBtn);
// await page.click(ridibooks.moreReviewBtn);
// reviewList = await page.$$(
// '#review_list_section > div.review_list_wrapper.js_review_list_wrapper.active > ul > li',
// );
// if (!(await page.$(ridibooks.moreReviewBtn))) {
// break;
// }
// console.log(reviewList.length);
// }
크롤링한 데이터를 json 파일로 저장
/**
 * Writes each scraped post, and its reviews when present, to JSON files.
 *
 * Content goes to `webNovels/` or `posts/` depending on `type`; reviews go to
 * `reviews/`. The file name is `[ridibooks] <id>.json`, where `<id>` is the
 * run of digits following `cover/` in the post's image URL.
 *
 * Posts are handled independently: a failure on one post is logged and the
 * remaining posts are still written. Entries that are `undefined` (which
 * `scrapPostAndReview` returns for pages that failed to scrape) and posts
 * without a usable cover id are skipped.
 *
 * @param type - `'webNovels'` writes to `webNovels/`; anything else to `posts/`.
 * @param posts - Results of `scrapPostAndReview`.
 */
async createJsonFile(type: string, posts: any) {
  const directory = type === 'webNovels' ? 'webNovels' : 'posts';
  const coverIdPattern = /(?<=cover\/)\d+/;

  for (const post of posts ?? []) {
    if (!post?.webContents) {
      continue;
    }

    try {
      // Use the first image URL that carries a cover id; interpolating the
      // whole array produced names like ".json" or comma-joined names.
      let coverId: string | undefined;
      for (const item of post.webContents.webContent ?? []) {
        const match =
          typeof item?.img === 'string' ? item.img.match(coverIdPattern) : null;
        if (match) {
          coverId = match[0];
          break;
        }
      }
      if (coverId === undefined) {
        console.error(
          `Skipping post without a cover id: ${post.webContents.url}`,
        );
        continue;
      }

      const fileName = `[ridibooks] ${coverId}`;
      await fs.writeFile(
        `${directory}/${fileName}.json`,
        JSON.stringify(post.webContents),
      );
      if (post.reviews) {
        await fs.writeFile(
          `reviews/${fileName}.json`,
          JSON.stringify(post.reviews),
        );
      }
    } catch (err) {
      // Keep going: one unwritable file should not drop the remaining posts.
      console.error(err);
    }
  }
}
'프로젝트' 카테고리의 다른 글
api를 이용한 스크래핑 성능 개선 (0) | 2024.04.15 |
---|---|
퍼펫티어를 이용한 크롤링(2) (0) | 2024.04.11 |
최종프로젝트 (0) | 2024.04.09 |
db 선택 (0) | 2024.03.27 |
To-DuBu: KPT 회고록 (0) | 2024.03.25 |