使用 Puppeteer 库通过自动化浏览器来访问百度图片搜索,并在搜索结果中下载图片。代码分为两部分:
- 自动化浏览器任务:使用 Puppeteer 浏览百度图片搜索并获取图片 URL。
- 图片下载:检查图片 URL 类型(base64 或 URL),并保存图片到本地。
如果无法通过npm安装:
确保 Puppeteer 的下载地址已经指向淘宝镜像
set PUPPETEER_DOWNLOAD_HOST=https://npmmirror.com/mirrors
示例:
javascript">import puppeteer from 'puppeteer';
import http from "http";
import https from "https";
import fs from "fs";
import {promisify} from 'util';
import qs from "querystring";
import {v4} from "uuid";(async () => {const browser = await puppeteer.launch({headless: false, // 打开浏览器browser: "chrome",slowMo: 250, // slow down by 250ms// executablePath:'', // 其它浏览器打开地址});const page = await browser.newPage();await page.goto('https://image.baidu.com/');console.log('goto: https://image.baidu.com/');await page.setViewport({width: 1920, height: 1080});let count = 0;while (count < 60) {try {await page.focus('[name="word"]');break;} catch (err) {count++;await page.reload();console.log('[name="word"] selector not found, try again');}}// 通过属性获取指定input,填入搜索框文字await page.keyboard.sendCharacter('卡皮巴拉');// 等于上面两条 page.focus page.keyboard.sendCharacter// await page.type('#kw', '卡皮巴拉', { delay: 100 });await page.click('.submit-btn_ZmEXZ');// await page.reload();page.on('load', async () => {console.log('page loaded!');await page.waitForSelector('.main_img');// evaluate 中会在浏览器端运行js代码。const src = await page.evaluate(() => {const images = document.querySelectorAll('.main_img');// 在puppeteer打开的浏览器查看console.log(images);// return images.map(img => img.src);return Array.prototype.map.call(images, img => img.src);});await Promise.all(src.map(src => {console.log(src);return downloadImage(src, './image/p1/');}));await page.screenshot({path: './screenshot.png',});await browser.close();});
})();const urlToImage = (url, dir, callback) => {const mod = /^https:/.test(url) ? https : http;const ext = qs.parse(url).f.split('?').shift().toLowerCase().replace('jpeg', 'jpg');const file = `${dir}${v4()}.${ext}`;return new Promise((resolve, reject) => {mod.get(url, res => {// 使用 fs.createWriteStream 创建文件流const writeStream = fs.createWriteStream(file);// 将响应数据流管道到文件写入流res.pipe(writeStream);// 在写入完成时调用回调writeStream.on('finish', () => {console.log(file);resolve();});});});
};const base64ToImage = async (base64, dir) => {try {const matches = base64.match(/^data:(.+?);base64,(.+)$/);const ext = matches[1].split('/')[1].replace('jpeg', 'jpg');const file = `${dir}${v4()}.${ext}`;await promisify(fs.writeFile)(file, matches[2], 'base64');} catch (err) {console.log(err);}
}async function downloadImage(src, dir) {if (/data:(.+?);base64,(.+)/.test(src)) {await base64ToImage(src, dir);} else if (src.startsWith('http')) {await urlToImage(src, dir);} else {console.log('[error] download fail, unsupported image type!', src);}
}