Java图片采集版本迭代总结

一、版本迭代

1、场景一

采集前几个级别的少量图片(千级),采用单线程单任务V1

2、场景二

越到后面的层级图片数量越大(万级),减少改动工作量,升级到了单线程分块任务V2,其中分块范围为手动输入。

3、场景三

后面层级的图片数据又跨了一个数量级(十万级),由于时间和设备有限,开始采用多线程分块任务V3

4、场景四

程序挂着跑,一觉醒来发现“凉凉”:由于高频次的采集,IP被封禁,访问图片返回403(百度搜索“我的IP”,可以查看当前用于访问的公网IP)。于是结合公司资源和网上技术,采用了动态IP代理进行采集V4

1、动态IP代理Http与Https代码示例
/*
 * Copyright (c) 2005, 2019, EVECOM Technology Co.,Ltd. All rights reserved.
 * EVECOM PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 *
 */
package util;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.net.Proxy.Type;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

/**
 * Sample client that sends requests through a forward proxy, with one entry point for HTTPS targets
 * ({@link #HttpsProxy}) and one for plain HTTP targets ({@link #HttpProxy}).
 *
 * <p><b>Security note:</b> the HTTPS path installs {@link TrustAnyTrustManager} and
 * {@link TrustAnyHostnameVerifier}, which accept every certificate and every host name. Server identity
 * is therefore NOT verified on that path.
 *
 * @author Harley Hong
 * @created 2019 /04/03 15:25:32
 */
public class HttpAndHttpsProxy {

    /**
     * Target address requested by {@link #main}. The value is a placeholder.
     */
    private static final String URL = "url";

    /**
     * Base directory under which {@link #HttpsProxy} stores the downloaded file. The value is a placeholder.
     */
    private static final String DIR = "dir";

    /**
     * Downloads the resource at {@code url} through the given proxy over HTTPS and writes it to
     * {@code DIR + "/P00/P00.jpg"}.
     *
     * <p>The response body is read completely into memory before the target file is opened, so a download
     * that fails half-way never leaves a truncated file behind. Any exception is printed and swallowed.
     *
     * @param url   the address to download
     * @param param unused by this method; kept for signature compatibility with {@link #HttpProxy}
     * @param proxy the proxy host name or IP address
     * @param port  the proxy port
     * @return always the empty string
     * @author Harley Hong
     * @created 2019 /04/03 15:25:32 Https proxy string.
     */
    public static String HttpsProxy(String url, String param, String proxy, int port) {
        String result = "";
        try {
            URL urlClient = new URL(url);
            System.out.println("请求的URL========:" + urlClient);
            SSLContext sc = SSLContext.getInstance("SSL");
            // SECURITY: trusts every server certificate (see TrustAnyTrustManager).
            sc.init(null, new TrustManager[] { new TrustAnyTrustManager() }, new java.security.SecureRandom());
            // The proxy type is Type.HTTP even though the target is HTTPS.
            Proxy proxy1 = new Proxy(Type.HTTP, new InetSocketAddress(proxy, port));
            HttpsURLConnection httpsConn = (HttpsURLConnection) urlClient.openConnection(proxy1);
            httpsConn.setSSLSocketFactory(sc.getSocketFactory());
            httpsConn.setHostnameVerifier(new TrustAnyHostnameVerifier());
            // Common request headers.
            httpsConn.setRequestProperty("accept", "*/*");
            httpsConn.setRequestProperty("connection", "Keep-Alive");
            httpsConn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            httpsConn.setDoOutput(true);
            httpsConn.setDoInput(true);

            // Make sure the storage directory exists before downloading.
            String imageStorageDir = DIR + "/P00";
            String fileName = "/P00.jpg";
            File storageDir = new File(imageStorageDir);
            if (!storageDir.exists()) {
                storageDir.mkdirs();
            }

            // Read the whole body first; try-with-resources closes the stream on every path.
            byte[] imageBytes;
            try (InputStream body = httpsConn.getInputStream()) {
                ByteArrayOutputStream collected = new ByteArrayOutputStream();
                byte[] chunk = new byte[1024];
                int length;
                while ((length = body.read(chunk)) != -1) {
                    collected.write(chunk, 0, length);
                }
                imageBytes = collected.toByteArray();
            }
            // Only now create the file, so it is never left open or half-written on a failed download.
            try (FileOutputStream fileOutputStream = new FileOutputStream(new File(imageStorageDir + fileName))) {
                fileOutputStream.write(imageBytes);
            }
            System.out.println(imageStorageDir + fileName + " download success");
        } catch (Exception e) {
            e.printStackTrace();
        }

        return result;
    }

    /**
     * Sends {@code param} as the request body to {@code url} through the given proxy over plain HTTP and
     * returns the response text.
     *
     * <p>Response lines are concatenated without line separators. Request and response text use the
     * platform default charset. Any exception is printed and swallowed; whatever was read up to that
     * point is returned.
     *
     * @param url   the address to request
     * @param param the request body to send
     * @param proxy the proxy host name or IP address
     * @param port  the proxy port
     * @return the response body with line breaks removed, or the empty string if nothing was read
     * @author Harley Hong
     * @created 2019 /04/03 15:25:32 Http proxy string.
     */
    public static String HttpProxy(String url, String param, String proxy, int port) {
        HttpURLConnection httpConn = null;
        StringBuilder response = new StringBuilder();
        try {
            URL urlClient = new URL(url);
            System.out.println("请求的URL========:" + urlClient);
            Proxy proxy1 = new Proxy(Type.HTTP, new InetSocketAddress(proxy, port));
            httpConn = (HttpURLConnection) urlClient.openConnection(proxy1);
            // Common request headers.
            httpConn.setRequestProperty("accept", "*/*");
            httpConn.setRequestProperty("connection", "Keep-Alive");
            httpConn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Both flags are required to send a request body and read the response.
            httpConn.setDoOutput(true);
            httpConn.setDoInput(true);
            // Send the request body; the writer is closed even if writing fails.
            try (PrintWriter out = new PrintWriter(httpConn.getOutputStream())) {
                out.print(param);
                out.flush();
            }
            // Read the response line by line; the reader is closed even if reading fails.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(httpConn.getInputStream()))) {
                String line;
                while ((line = in.readLine()) != null) {
                    response.append(line);
                }
            }
            System.out.println("====result====" + response);
            // Query the status text while the connection is still open; disconnect happens in finally.
            System.out.println("返回结果:" + httpConn.getResponseMessage());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (httpConn != null) {
                httpConn.disconnect();
            }
        }

        return response.toString();
    }

    /**
     * Trust manager that performs no checks and therefore accepts every certificate chain.
     *
     * <p><b>SECURITY:</b> using this class disables certificate validation for the connection.
     *
     * @author Harley Hong
     * @created 2019 /04/03 15:25:32
     */
    public static class TrustAnyTrustManager implements X509TrustManager {
        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // Intentionally empty: every client chain is accepted.
        }

        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // Intentionally empty: every server chain is accepted.
        }

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[] {};
        }
    }

    /**
     * Hostname verifier that accepts every host name.
     *
     * <p><b>SECURITY:</b> using this class disables host name verification for the connection.
     *
     * @author Harley Hong
     * @created 2019 /04/03 15:25:32
     */
    public static class TrustAnyHostnameVerifier implements HostnameVerifier {
        @Override
        public boolean verify(String hostname, SSLSession session) {
            return true;
        }
    }

    /**
     * Runs one HTTPS request and one HTTP request against {@code URL} through two hard-coded proxies.
     *
     * @param args the input arguments (unused)
     * @author Harley Hong
     * @created 2019 /04/03 15:25:32
     */
    public static void main(String[] args) {
        HttpsProxy(URL, "", "220.186.189.193", 4307);
        HttpProxy(URL, "", "127.0.0.1", 81);
    }

}

5、场景五

由于之前的恶意与高频采集,服务端越来越不友善,而动态IP代理本身访问较慢且有时会丢包,所以把需要采集的目录和文件与已经采集的进行对比分析,并存库以供补采(即已采集:1;未采集:0),升级为图片补采程序V5

6、场景六

有对比分析把未采集的存库标为0,就要把采集成功的图片置为1,升级采集成功库表更新程序V6

二、核心程序

1、分线程动态IP

// Create the proxy-refresh task with a 30000 ms interval and run it on its own thread.
GetDynamicIpBySunIp getDynamicIpBySunIp = new GetDynamicIpBySunIp(30000);
Thread thread = new Thread(getDynamicIpBySunIp);
thread.start();
 /**
     * Background task that repeatedly fetches a proxy address via {@code getDynamicIpListBySun()} and
     * stores it in the {@code ip} / {@code port} variables declared outside this class, sleeping for a
     * fixed interval between fetches.
     *
     * <p>The loop ends when {@code threadOver} becomes true or when the running thread is interrupted.
     *
     * @author Harley Hong
     * @created 2019 /03/31 20:48:55
     */
    public static class GetDynamicIpBySunIp implements Runnable {

        /**
         * Time to wait between two proxy refreshes, in milliseconds. There is no default; the value is
         * always supplied through the constructor.
         */
        int ipIntervalTimeMs;

        /**
         * Creates the task.
         *
         * @param ipIntervalTimeMs time to wait between two proxy refreshes, in milliseconds
         */
        public GetDynamicIpBySunIp(int ipIntervalTimeMs) {
            this.ipIntervalTimeMs = ipIntervalTimeMs;
        }

        @Override
        public void run() {
            // Keep refreshing until the collecting side signals completion via threadOver.
            while (!threadOver) {
                Map<String, Object> ipMap = getDynamicIpListBySun();
                if (!ipMap.isEmpty()) {
                    try {
                        // Parse the port BEFORE touching the shared values, so a malformed port can
                        // never leave a new ip paired with the old port.
                        Integer newPort = Integer.valueOf(String.valueOf(ipMap.get("port")));
                        ip = (String) ipMap.get("ip");
                        port = newPort;
                    } catch (NumberFormatException e) {
                        // Keep the previous proxy and try again on the next round instead of
                        // letting the refresh thread die.
                        e.printStackTrace();
                        System.err.println("获取IP出错");
                    }
                }
                // Sleeping here is what keeps the current proxy in use for the whole interval.
                try {
                    Thread.sleep(ipIntervalTimeMs);
                } catch (InterruptedException e) {
                    // Restore the interrupt flag and stop: an interrupt is a request to shut down.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }
    /**
     * Requests one proxy endpoint from the dynamic-IP provider and returns it as a map.
     *
     * <p>The response body is read to the end, split on {@code '\n'}, and each non-blank line is parsed as
     * a JSON object whose {@code "data"} array is expected to hold the proxy entry at index 0. Any
     * exception is printed and swallowed.
     *
     * @return a map with the keys {@code "ip"} and {@code "port"} (both stored as strings) when an entry
     *         with a non-empty ip was found; otherwise an empty map. Never {@code null}.
     * @author Harley Hong
     * @created 2019 /03/31 20:55:53
     */
    public static Map<String, Object> getDynamicIpListBySun() {
        Map<String, Object> ipMap = new HashMap<>();
        HttpURLConnection connection = null;
        try {
            URL url = new URL(
                    "http://http.tiqu.qingjuhe.cn/getip?num=1&type=2&pro=&city=0&yys=0&port=11&pack=24673&ts=1&ys=1&cs=1&lb=1&sb=0&pb=4&mr=0&regions=");
            // Open the connection exactly once so the timeout applies to the connection that is used.
            connection = (HttpURLConnection) url.openConnection();
            connection.setConnectTimeout(3000);

            // Read until end of stream. available() only reports what can be read without blocking,
            // so it must not be used to size the buffer for the whole body.
            java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
            try (InputStream in = new BufferedInputStream(connection.getInputStream())) {
                byte[] chunk = new byte[4096];
                int bytesRead;
                while ((bytesRead = in.read(chunk)) != -1) {
                    body.write(chunk, 0, bytesRead);
                }
            }

            String[] resultArr = body.toString("UTF-8").split("\n");
            for (String result : resultArr) {
                if (result.trim().isEmpty()) {
                    continue;
                }
                JSONObject jsonObject = JSON.parseObject(result);
                if (jsonObject == null) {
                    continue;
                }
                JSONArray jsonDataArr = jsonObject.getJSONArray("data");
                if (jsonDataArr == null || jsonDataArr.isEmpty()) {
                    continue;
                }
                JSONObject dataEntry = jsonDataArr.getJSONObject(0);
                if (dataEntry == null) {
                    continue;
                }
                // getString converts a numeric JSON value to text, so the port may be sent either way.
                String ip = dataEntry.getString("ip");
                String port = dataEntry.getString("port");
                if (StringUtils.isNotEmpty(ip)) {
                    ipMap.put("ip", ip);
                    ipMap.put("port", port);
                }
            }
            if (ipMap.isEmpty()) {
                // The request succeeded but carried no usable entry; do not report success.
                System.err.println("获取IP出错");
            } else {
                System.out.println("获取IP成功:" + ipMap);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.err.println("获取IP出错");
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return ipMap;
    }

2、多线程分块采集

// Multi-threaded collection: create and start one worker per index from 1 to threadNum; each worker
// is given only its own slice of image rows.
for (int workerIndex = 1; workerIndex <= threadNum; workerIndex++) {
	// Look up the row range assigned to this worker.
	Map<String, Object> rowRange = threadStartAndEndImgDiv(workerIndex, startRowNum, endRowNum, threadNum);
	int firstRow = (int) rowRange.get("startRow");
	int lastRow = (int) rowRange.get("endRow");
	// Build the worker for that slice and start it.
	ArcGISRSImgColByDataBaseFinal worker = new ArcGISRSImgColByDataBaseFinal("线程" + workerIndex, level, con,
			list, pstm, firstRow, lastRow, startRowNum, threadNum);
	worker.start();
}

三、最后总结

采集一定要做动态IP,分场景去实现采集和支持补采很有必要。另外,线程数并不是越多越好:默认情况下,一个线程的栈要预留1M的内存空间,而在32位系统下,一个进程中可用的内存空间只有2G,所以理论上一个进程中最多可以开2048个线程;但是内存不可能完全拿来作线程的栈,所以实际数目要比这个值小。

Leave a Reply

Your email address will not be published. Required fields are marked *