利用WebCollector爬取wallhere壁纸

WebCollector 是一个无须配置、便于二次开发的 JAVA 爬虫框架(内核),提供精简的 API,只需少量代码即可实现一个功能强大的爬虫。

源码中集成了 Jsoup,可进行精准的网页解析,2.x 版本中集成了 selenium,可以处理 JavaScript 生成的数据。

操作浏览器:

谷歌浏览器chromedriver(使用时需要与本地浏览器版本一致):http://npm.taobao.org/mirrors/chromedriver/

无界面浏览器phantomjs:https://phantomjs.org/download.html

QQ截图20200104093304.png

QQ截图20200104093419.png

QQ截图20200104094207.png

QQ截图20200104094243.png

代码:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//采集wallhere
/**
 * Command-line crawler that downloads wallpapers from wallhere.com.
 *
 * <p>A Chrome instance driven by Selenium opens the search page and is refreshed before every
 * listing request so that its current cookies can be copied into plain Jsoup requests. For each
 * listing page the detail-page links are extracted, each detail page is fetched to find the
 * image URL, and every image is downloaded on its own thread into {@code <user.dir>/img}.
 *
 * <p>All state is static; the class is meant to be used by one crawl at a time.
 */
public class CollectionWallhere {
    /** Search/listing endpoint; the URL-encoded query (and optionally page/format) is appended. */
    private static final String SEARCH_URL = "https://wallhere.com/zh/wallpapers?q=";

    /** User-Agent sent with the Jsoup requests. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36";

    /**
     * Extracts detail-page paths from the serialized listing response. The pattern expects the
     * href value to be wrapped in a backslash followed by {@code &quot;}.
     * NOTE(review): presumably this is how Jsoup re-serializes the escaped HTML embedded in the
     * JSON response - confirm against a live response.
     */
    private static final Pattern DETAIL_LINK =
            Pattern.compile("<a href=\\\"\\\\&quot;(.*?)\\\\&quot;\\\"><img alt=");

    /**
     * Extracts the {@code twitter:image} meta content from a serialized detail page.
     * NOTE(review): the trailing space after {@code >} is part of the original pattern and is
     * kept unchanged - confirm it matches Jsoup's output for the Jsoup version in use.
     */
    private static final Pattern TWITTER_IMAGE =
            Pattern.compile("<meta name=\\\"twitter:image\\\" content=\\\"(.*?)\\\"> ");

    private static WebDriver driver;
    private static ChromeOptions options;
    private static String html;
    /** URL-encoded search term, set by {@link #wallhereRun(String, Integer)}. */
    private static String type;
    /**
     * URL of the image most recently queued for download. Kept for backward compatibility only;
     * download threads no longer read it because it changes while they are running.
     */
    public static String imgurl = null;

    /**
     * Reads the category and the number of pages from standard input and starts the crawl.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        System.out.println("采集图片需要选择类型,如:动漫,风景,建筑,美女....");
        // Example noted by the author: category "动漫" had 5105 pages.
        System.out.println("请输入你需要采集的类型:");
        String lx = scanner.next();
        System.out.println("请输入你需要采集的页数:");
        Integer page = scanner.nextInt();
        wallhereRun(lx, page);
    }

    /**
     * Starts a crawl: encodes the search term, launches Chrome, opens the search page and then
     * walks the listing pages.
     *
     * @param lx   search term / category (un-encoded)
     * @param page number of listing pages to crawl, starting from page 1
     */
    public static void wallhereRun(String lx, Integer page) {
        // Encode first so that a bad argument fails before a browser is launched.
        try {
            type = URLEncoder.encode(lx, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("UTF-8 encoding is not available", e);
        }
        WallhereConfig();
        try {
            driver.get(SEARCH_URL + type);
            pause(10 * 1000L); // give the page time to finish loading
            html = driver.getPageSource();
        } catch (RuntimeException e) {
            // Do not leave a Chrome process behind when the initial navigation fails.
            quitDriver();
            throw e;
        }
        WallhereByImgByJson(page);
    }

    /**
     * Creates the Chrome driver from {@code ./drivers/chromedriver.exe} with the
     * {@code profile.managed_default_content_settings.images} preference set to 2
     * (the original author's note: disables image loading).
     */
    public static void WallhereConfig() {
        System.setProperty("webdriver.chrome.driver", "./drivers/chromedriver.exe");
        options = new ChromeOptions();
        Map<String, Object> prefs = new HashMap<String, Object>();
        prefs.put("profile.managed_default_content_settings.images", 2);
        options.setExperimentalOption("prefs", prefs);
        driver = new ChromeDriver(options);
    }

    /**
     * Crawls listing pages 1..{@code page}, starting one download thread per image found, and
     * closes the browser when finished (also when an exception aborts the crawl).
     *
     * <p>Between two pages the method waits three minutes; there is no wait after the last page.
     * If the calling thread is interrupted the crawl stops after the current page.
     *
     * @param page number of listing pages to crawl; {@code null} or a value below 1 crawls nothing
     */
    public static void WallhereByImgByJson(Integer page) {
        final int lastPage = (page == null) ? 0 : page;
        final String targetDir = System.getProperty("user.dir") + "/img";
        try {
            for (int i = 1; i <= lastPage; i++) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                // Refresh the browser page to pick up the current cookies.
                Map<String, String> cookies = getWallhereCookies();
                List<String> detailUrls =
                        getWallhereImgByUrl(SEARCH_URL + type + "&page=" + i + "&format=json", cookies);
                for (String detailUrl : detailUrls) {
                    // Captured in a final local: the worker thread must not read the shared
                    // static field, which is overwritten by the next loop iteration.
                    final String imageUrl = getWallhereByImgUrl(detailUrl, cookies);
                    if (imageUrl == null) {
                        System.out.println("未找到图片地址:" + detailUrl);
                        continue;
                    }
                    imgurl = imageUrl;
                    System.out.println("下载:" + imageUrl);
                    new Thread(() -> downImages(targetDir, imageUrl)).start();
                }
                if (i < lastPage && !pause(3 * 60 * 1000L)) {
                    break;
                }
            }
        } finally {
            quitDriver();
        }
    }

    /**
     * Refreshes the browser page, waits five seconds and returns the browser's cookies.
     *
     * @return a new mutable map from cookie name to cookie value
     */
    public static Map<String, String> getWallhereCookies() {
        driver.navigate().refresh();
        pause(5 * 1000L);
        Set<Cookie> cookies = driver.manage().getCookies();
        Map<String, String> map = new HashMap<String, String>();
        for (Cookie c : cookies) {
            map.put(c.getName(), c.getValue());
        }
        System.out.println("获取cookie:" + map.toString());
        return map;
    }

    /**
     * Fetches one listing page and returns the detail-page URLs found in it.
     *
     * @param url listing URL, e.g.
     *            {@code https://wallhere.com/zh/wallpapers?q=%E5%8A%A8%E6%BC%AB&page=2&format=json}
     * @param map cookies to send with the request
     * @return absolute detail-page URLs in document order; empty when the request fails or
     *         nothing matches (never {@code null})
     */
    public static List<String> getWallhereImgByUrl(String url, Map<String, String> map) {
        List<String> list = new ArrayList<>();
        Document doc;
        try {
            doc = Jsoup.connect(url)
                    .header("Accept", "*/*")
                    .header("Accept-Encoding", "gzip, deflate")
                    .header("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
                    .header("Content-Type", "application/json;charset=UTF-8")
                    .userAgent(USER_AGENT)
                    .cookies(map)
                    .ignoreContentType(true)
                    .postDataCharset("GBK")
                    .timeout(60 * 1000)
                    .get();
        } catch (IOException e) {
            // A failed page must not abort the whole crawl (and must not cause an NPE below).
            System.err.println("获取列表页失败:" + url + " (" + e + ")");
            return list;
        }
        Matcher m = DETAIL_LINK.matcher(doc.html());
        while (m.find()) {
            // The captured path has its slashes escaped as "\/"; undo that.
            list.add("https://wallhere.com" + m.group(1).replace("\\/", "/"));
        }
        return list;
    }

    /**
     * Fetches a detail page and returns the image URL from its {@code twitter:image} meta tag.
     *
     * @param url detail-page URL, e.g. {@code https://wallhere.com/zh/wallpaper/1275327}
     * @param map cookies to send with the request
     * @return the image URL (the last match if there are several), or {@code null} when the
     *         request fails or no matching meta tag is found
     */
    public static String getWallhereByImgUrl(String url, Map<String, String> map) {
        Document doc;
        try {
            doc = Jsoup.connect(url)
                    .userAgent(USER_AGENT)
                    .cookies(map)
                    .ignoreContentType(true)
                    .postDataCharset("GBK")
                    .timeout(60 * 1000)
                    .get();
        } catch (IOException e) {
            System.err.println("获取图片页失败:" + url + " (" + e + ")");
            return null;
        }
        String found = null;
        Matcher m = TWITTER_IMAGE.matcher(doc.html());
        while (m.find()) {
            found = m.group(1);
        }
        return found;
    }

    /**
     * Downloads an image into the given directory, creating the directory if necessary.
     *
     * <p>The local file name is the last path segment of the URL with every {@code "!d"}
     * removed. Failures are reported on standard output and never thrown; a file that was only
     * partially written is deleted.
     *
     * @param filePath target directory
     * @param imgUrl   image URL
     */
    public static void downImages(String filePath, String imgUrl) {
        if (imgUrl == null || imgUrl.isEmpty()) {
            System.out.println("下载异常A: 图片地址为空");
            return;
        }
        File dir = new File(filePath);
        // Re-check with isDirectory(): another download thread may have created it meanwhile.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            System.out.println("下载异常B: 无法创建目录 " + filePath);
            return;
        }

        int nameStart = imgUrl.lastIndexOf('/') + 1;
        String fileName = imgUrl.substring(nameStart);
        String localName = fileName.replace("!d", "");
        if (localName.isEmpty()) {
            System.out.println("下载异常A: 无法从地址解析文件名 " + imgUrl);
            return;
        }

        // The file name may contain non-ASCII characters or spaces, so it is URL-encoded;
        // URLEncoder turns spaces into '+', which has to become %20 inside a URL path.
        String requestUrl = imgUrl;
        try {
            requestUrl = imgUrl.substring(0, nameStart)
                    + URLEncoder.encode(fileName, "UTF-8").replace("+", "%20");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace(); // cannot happen for UTF-8; fall back to the raw URL
        }

        File target = new File(dir, localName);
        boolean started = false;
        boolean complete = false;
        try {
            URLConnection connection = new URL(requestUrl).openConnection();
            connection.setConnectTimeout(60 * 1000);
            connection.setReadTimeout(60 * 1000);
            // A browser-like User-Agent avoids a 403 for non-browser clients.
            connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            try (InputStream in = connection.getInputStream();
                 OutputStream out = new BufferedOutputStream(new FileOutputStream(target))) {
                started = true;
                byte[] buf = new byte[8192];
                int size;
                while (-1 != (size = in.read(buf))) {
                    out.write(buf, 0, size);
                }
                out.flush();
                complete = true;
            }
        } catch (MalformedURLException e) {
            System.out.println("下载异常A: " + requestUrl + " (" + e.getMessage() + ")");
        } catch (IOException e) {
            System.out.println("下载异常B: " + requestUrl + " (" + e + ")");
        } finally {
            // Only remove the file if this call opened it, so that a failed connection never
            // deletes a file left by an earlier successful download.
            if (started && !complete && !target.delete()) {
                System.out.println("无法删除未完成的文件:" + target);
            }
        }
    }

    /**
     * Sleeps for the given time.
     *
     * @return {@code false} if the sleep was interrupted (the interrupt flag is restored)
     */
    private static boolean pause(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /** Closes the browser if one is open; safe to call more than once. */
    private static void quitDriver() {
        WebDriver current = driver;
        driver = null;
        if (current != null) {
            current.quit();
        }
    }
}

本代码需要配合 chromedriver.exe(Chrome 浏览器驱动)使用。另外附上一份已打包好、配合 JDK 1.8.x 可直接运行的包,包内含 chromedriver.exe,可供开发使用。


转载请注明出处:凌夕博客 » 漫小猫,原文地址:《利用WebCollector爬取wallhere壁纸》

相关推荐

发表评论

路人甲 表情
Ctrl+Enter快速提交

网友评论(0)