代码样例-Http隧道

本文档包含编程请求http隧道的代码样例,供开发者参考。

代码样例使用说明

  1. 代码样例不能直接运行,因为代码中的隧道服务器域名mytunnelhost、端口mytunnelport、隧道idmytid、密码mypassword都是虚构的,您替换成自己真实的信息,就可以正常运行了。查看我的隧道信息>>
  2. 代码样例正常运行所需的运行环境和注意事项在样例末尾均有说明,使用前请仔细阅读。
  3. 使用代码样例过程中遇到问题请联系售后客服,我们会为您提供技术支持。

Python2

requests

requests(推荐)

使用提示

  1. 基于requests的代码样例支持访问http,https网页,推荐使用。
  2. requests不是python原生库,需要安装才能使用: pip install requests
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""使用requests请求代理服务器
请求http和https网页均适用
"""

import requests

# 要访问的目标网页
page_urls = ["http://dev.kdlapi.com/testproxy",
             "https://dev.kdlapi.com/testproxy",
             ]

# 隧道服务器
tunnel_host = "mytunnelhost"
tunnel_port = "mytunnelport"

# 隧道id和密码
tid = "mytid"
password = "mypassword"

proxies = {
    "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
    "https": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
}

headers = {
    "Accept-Encoding": "Gzip",  # 使用gzip压缩传输数据让访问更快
}

for url in page_urls:
    r = requests.get(url, proxies=proxies, headers=headers)

    print r.status_code  # 获取Reponse的返回码

    if r.status_code == 200:
        r.enconding = "utf-8"  # 设置返回内容的编码
        print r.content  # 获取页面内容

urllib2

urllib2

使用提示

  • 基于urllib2的代码样例同时支持访问http和https网页
  • 运行环境要求 python2.6 / 2.7
#!/usr/bin/env python
#-*- coding: utf-8 -*-

"""使用urllib2请求代理服务器
请求http和https网页均适用
"""

import urllib2
import zlib
import ssl

ssl._create_default_https_context = ssl._create_unverified_context  # 全局取消证书验证,避免访问https网页报错

#要访问的目标网页
page_urls = ["http://dev.kdlapi.com/testproxy",
             "https://dev.kdlapi.com/testproxy",
             ]

# 隧道服务器
tunnel_host = "mytunnelhost"
tunnel_port = "mytunnelport"

# 隧道id和密码
tid = "mytid"
password = "mypassword"

proxies = {
    "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
    "https": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
}

for url in page_urls:
    req = urllib2.Request(url)
    req.add_header("Accept-Encoding", "Gzip") #使用gzip压缩传输数据让访问更快
    proxy_hander = urllib2.ProxyHandler(proxies)
    opener = urllib2.build_opener(proxy_hander)
    urllib2.install_opener(opener)
    r = urllib2.urlopen(req)

    print r.code
    content_encoding = r.headers.getheader("Content-Encoding")
    if content_encoding and "gzip" in content_encoding:
        print zlib.decompress(r.read(), 16+zlib.MAX_WBITS) #获取页面内容
    else:
        print r.read() #获取页面内容

Python3

requests

requests(推荐)

使用提示

  1. 基于requests的代码样例支持访问http,https网页,推荐使用
  2. requests不是python原生库,需要安装才能使用: pip install requests
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Request pages through the HTTP tunnel proxy using requests.

Works for both http and https pages (Python 3).
"""

import requests

# Target pages to fetch
page_urls = ["http://dev.kdlapi.com/testproxy",
             "https://dev.kdlapi.com/testproxy",
             ]

# Tunnel server
tunnel_host = "mytunnelhost"
tunnel_port = "mytunnelport"

# Tunnel id and password
tid = "mytid"
password = "mypassword"

proxies = {
    "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
    "https": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
}

headers = {
    "Accept-Encoding": "Gzip",  # gzip-compressed transfer makes the download faster
}

for url in page_urls:
    r = requests.get(url, proxies=proxies, headers=headers)

    print(r.status_code)  # HTTP status code of the response

    if r.status_code == 200:
        # Bug fix: was "r.enconding" (typo), which only set a meaningless
        # attribute. Note r.encoding affects r.text, not r.content.
        r.encoding = "utf-8"
        print(r.content)  # raw page body (bytes)

urllib

urllib

使用提示

  • 基于urllib的代码样例同时支持访问http和https网页
  • 运行环境要求 python3.x
"""Request pages through the HTTP tunnel proxy using urllib.request.

Works for both http and https pages (Python 3.x).
"""

import urllib.request
import zlib
import ssl

# Disable certificate verification globally so https pages don't fail
ssl._create_default_https_context = ssl._create_unverified_context

# Target pages to fetch
page_urls = ["http://dev.kdlapi.com/testproxy",
             "https://dev.kdlapi.com/testproxy",
             ]

# Tunnel server
tunnel_host = "mytunnelhost"
tunnel_port = "mytunnelport"

# Tunnel id and password
tid = "mytid"
password = "mypassword"

proxies = {
    "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
    "https": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
}

headers = {
    "Accept-Encoding": "Gzip",  # gzip-compressed transfer makes the download faster
}

# Improvement: build the proxy opener once — it does not depend on the url,
# so re-creating it on every loop iteration was wasted work.
proxy_handler = urllib.request.ProxyHandler(proxies)  # fix: was misspelled "proxy_hander"
opener = urllib.request.build_opener(proxy_handler)

for url in page_urls:
    req = urllib.request.Request(url=url, headers=headers)

    result = opener.open(req)
    print(result.status)  # HTTP status code of the response

    content_encoding = result.headers.get('Content-Encoding')
    if content_encoding and "gzip" in content_encoding:
        # 16 + MAX_WBITS tells zlib to expect a gzip header
        print(zlib.decompress(result.read(), 16 + zlib.MAX_WBITS).decode('utf-8'))  # page body
    else:
        print(result.read().decode('utf-8'))  # page body

Python-Scrapy

scrapy项目标准目录结构如下:
scrapy项目结构

使用提示

  1. http/https网页均可适用
  2. scrapy不是python原生库,需要安装才能使用: pip install scrapy
  3. 在第一级scrapy_proxy目录下运行如下命令查看结果:scrapy crawl main
middlewares.py

middlewares.py里添加如下代码进行代理设置

import base64
import logging
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# Tunnel server (replace with your real host/port)
tunnel_host = "mytunnelhost"
tunnel_port = "mytunnelport"

# Tunnel id and password
tid = "mytid"
password = "mypassword"

logger = logging.getLogger(__name__)

# Proxy middleware
class ProxyMiddleware(object):
    """Route every outgoing request through the tunnel proxy.

    Sets request.meta['proxy'] and attaches a Basic Proxy-Authorization
    header built from the tunnel id and password.
    """

    def process_request(self, request, spider):
        proxy_url = 'http://%s:%s@%s:%s' % (tid, password, tunnel_host, tunnel_port)
        request.meta['proxy'] = proxy_url  # tell Scrapy which proxy to use
        logger.debug("using proxy: {}".format(request.meta['proxy']))
        # Proxy authentication: base64-encode "tid:password" (Python 3 form).
        credentials = '%s:%s' % (tid, password)
        auth = "Basic %s" % base64.b64encode(credentials.encode('utf-8')).decode('utf-8')
        # Python 2 equivalent:
        # auth = "Basic " + base64.b64encode('%s:%s' % (tid, password))
        request.headers['Proxy-Authorization'] = auth


class AgentMiddleware(UserAgentMiddleware):
    """User-Agent middleware: set a fixed desktop Firefox UA on each request."""

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        firefox_ua = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; '
                      'rv:39.0) Gecko/20100101 Firefox/39.0')
        # setdefault: keep any User-Agent already set by the request itself.
        request.headers.setdefault('User-Agent', firefox_ua)

settings.py

settings.py里设置DOWNLOADER_MIDDLEWARES使新增的middleware生效

ROBOTSTXT_OBEY = False  # don't obey robots.txt; raises the success rate for this demo
DOWNLOADER_MIDDLEWARES = {
    # Fix: the 'scrapy.contrib.*' module paths were removed in Scrapy 1.0;
    # use the modern 'scrapy.downloadermiddlewares.*' paths, consistent with
    # the UserAgentMiddleware import used in middlewares.py.
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    'scrapy_proxy.middlewares.ProxyMiddleware': 100,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'scrapy_proxy.middlewares.AgentMiddleware': 1,
}

main.py

在spiders目录下手动创建爬虫文件main.py

# -*- coding: utf-8 -*-
import scrapy

class MainSpider(scrapy.Spider):
    """Example spider: fetch the JD home page (run with `scrapy crawl main`)."""
    name = "main"
    allowed_domains = ["www.jd.com"]
    start_urls = ['https://www.jd.com']

    def parse(self, response):
        # Print the returned page body; when pointed at an echo/test page
        # this shows which IP made the request.
        print('------ response ------', response.text)

Java

jdk

使用原生库

使用提示

  1. 此样例同时支持访问http和https网页
  2. 运行环境要求 jdk >= 1.6
package com.kuaidaili.sdk;

import java.util.HashMap;
import java.util.Map;

/**
 * Request pages through the tunnel proxy server using the plain JDK stack.
 * Works for both http and https pages.
 */
public class TestProxy {

    private static String pageUrl1 = "http://dev.kdlapi.com/testproxy"; // target page (http)
    private static String pageUrl2 = "https://dev.kdlapi.com/testproxy"; // target page (https)
    private static String tunnelHost = "mytunnelhost"; // tunnel server host
    private static String tunnelPort = "mytunnelport"; // tunnel server port
    private static String username = "myusername"; // tunnel id
    private static String password = "mypassword"; // password

    public static void main(String[] args) {
        HttpRequest request = new HttpRequest();
        Map<String, String> params = new HashMap<String, String>();
        Map<String, String> headers = new HashMap<String, String>();

        headers.put("Accept-Encoding", "gzip"); // gzip-compressed transfer makes the download faster

        Map<String, String> proxySettings = new HashMap<String, String>();
        proxySettings.put("ip", tunnelHost);
        proxySettings.put("port", tunnelPort);
        proxySettings.put("username", username);
        proxySettings.put("password", password);

        // Improvement: the two identical request/print/try-catch sections
        // are factored into a single helper.
        fetchAndPrint(request, pageUrl1, params, headers, proxySettings);
        fetchAndPrint(request, pageUrl2, params, headers, proxySettings);
    }

    /** Send one GET through the proxy and print the status code and body. */
    private static void fetchAndPrint(HttpRequest request, String pageUrl,
            Map<String, String> params, Map<String, String> headers,
            Map<String, String> proxySettings) {
        try {
            HttpResponse response = request.sendGet(pageUrl, params, headers, proxySettings);
            System.out.println(response.getCode());
            System.out.println(response.getContent());
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}
查看工具类HttpRequest和HttpResponse

HttpRequest.java

package com.kuaidaili.sdk;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.Authenticator;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.PasswordAuthentication;
import java.net.Proxy;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.Vector;
import java.util.zip.GZIPInputStream;

/**
 * HTTP request helper.
 *
 * Sends GET/POST requests with optional parameters, headers and an
 * (optionally authenticated) HTTP proxy, wrapping the result in an
 * {@link HttpResponse}.
 */
public class HttpRequest {

    // Fallback charset name, used when the response declares no encoding.
    private String defaultContentEncoding;
    // Connect / read timeouts in milliseconds.
    private int connectTimeout = 1000;
    private int readTimeout = 1000;

    public HttpRequest() {
        this.defaultContentEncoding = Charset.defaultCharset().name();
    }

    /**
     * Send a GET request.
     *
     * @param urlString URL to request
     * @param proxySettings proxy settings; null means no proxy
     * @return response object
     */
    public HttpResponse sendGet(String urlString, final Map<String, String> proxySettings) throws IOException {
        return this.send(urlString, "GET", null, null, proxySettings);
    }

    /**
     * Send a GET request.
     *
     * @param urlString URL to request
     * @param params query parameters
     * @param proxySettings proxy settings; null means no proxy
     * @return response object
     */
    public HttpResponse sendGet(String urlString, Map<String, String> params, final Map<String, String> proxySettings)
            throws IOException {
        return this.send(urlString, "GET", params, null, proxySettings);
    }

    /**
     * Send a GET request.
     *
     * @param urlString URL to request
     * @param params query parameters
     * @param headers request headers
     * @param proxySettings proxy settings; null means no proxy
     * @return response object
     */
    public HttpResponse sendGet(String urlString, Map<String, String> params,
            Map<String, String> headers, final Map<String, String> proxySettings) throws IOException {
        return this.send(urlString, "GET", params, headers, proxySettings);
    }

    /**
     * Send a POST request.
     *
     * @param urlString URL to request
     * @param proxySettings proxy settings; null means no proxy
     * @return response object
     */
    public HttpResponse sendPost(String urlString, final Map<String, String> proxySettings) throws IOException {
        return this.send(urlString, "POST", null, null, proxySettings);
    }

    /**
     * Send a POST request.
     *
     * @param urlString URL to request
     * @param params form parameters
     * @param proxySettings proxy settings; null means no proxy
     * @return response object
     */
    public HttpResponse sendPost(String urlString, Map<String, String> params, final Map<String, String> proxySettings)
            throws IOException {
        return this.send(urlString, "POST", params, null, proxySettings);
    }

    /**
     * Send a POST request.
     *
     * @param urlString URL to request
     * @param params form parameters
     * @param headers request headers
     * @param proxySettings proxy settings; null means no proxy
     * @return response object
     */
    public HttpResponse sendPost(String urlString, Map<String, String> params,
            Map<String, String> headers, final Map<String, String> proxySettings) throws IOException {
        return this.send(urlString, "POST", params, headers, proxySettings);
    }

    /**
     * Send the HTTP request and return the parsed response.
     */
    private HttpResponse send(String urlString, String method,
            Map<String, String> parameters, Map<String, String> headers, final Map<String, String> proxySettings)
            throws IOException {
        HttpURLConnection urlConnection = null;

        // For GET, url-encode the parameters into the query string.
        if (method.equalsIgnoreCase("GET") && parameters != null) {
            StringBuffer param = new StringBuffer();
            int i = 0;
            for (String key : parameters.keySet()) {
                if (i == 0)
                    param.append("?");
                else
                    param.append("&");
                param.append(key).append("=").append(URLEncoder.encode(parameters.get(key), "utf-8"));
                i++;
            }
            urlString += param;
        }
        URL url = new URL(urlString);
        if(proxySettings != null){
            // Open the connection through the configured HTTP proxy.
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxySettings.get("ip"), Integer.parseInt(proxySettings.get("port"))));
            urlConnection = (HttpURLConnection) url.openConnection(proxy);
            if(proxySettings.containsKey("username")){
                // NOTE(review): Authenticator.setDefault is a JVM-global side
                // effect — it applies to every connection in the process.
                Authenticator authenticator = new Authenticator() {
                    public PasswordAuthentication getPasswordAuthentication() {
                        return (new PasswordAuthentication(proxySettings.get("username"),
                                proxySettings.get("password").toCharArray()));
                    }
                };
                Authenticator.setDefault(authenticator);
            }
        }
        else{
            urlConnection = (HttpURLConnection) url.openConnection();
        }

        urlConnection.setRequestMethod(method);
        urlConnection.setDoOutput(true);
        urlConnection.setDoInput(true);
        urlConnection.setUseCaches(false);

        urlConnection.setConnectTimeout(connectTimeout);
        urlConnection.setReadTimeout(readTimeout);

        if (headers != null)
            for (String key : headers.keySet()) {
                urlConnection.addRequestProperty(key, headers.get(key));
            }

        // For POST, url-encode the parameters into the request body.
        if (method.equalsIgnoreCase("POST") && parameters != null) {
            StringBuffer param = new StringBuffer();
            int i = 0;
            for (String key : parameters.keySet()) {
                if(i > 0) param.append("&");
                param.append(key).append("=").append(URLEncoder.encode(parameters.get(key), "utf-8"));
                i++;
            }
            System.out.println(param.toString());
            urlConnection.getOutputStream().write(param.toString().getBytes());
            urlConnection.getOutputStream().flush();
            urlConnection.getOutputStream().close();
        }

        return this.makeContent(urlString, urlConnection);
    }

    /**
     * Read the connection body and build the response object.
     */
    private HttpResponse makeContent(String urlString,
            HttpURLConnection urlConnection) throws IOException {
        HttpResponse response = new HttpResponse();
        try {
            InputStream in = urlConnection.getInputStream();
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in));
            // Transparently decompress gzip-encoded bodies.
            if ("gzip".equals(urlConnection.getContentEncoding())) bufferedReader =  new BufferedReader(new InputStreamReader(new GZIPInputStream(in)));
            response.contentCollection = new Vector<String>();
            StringBuffer temp = new StringBuffer();
            String line = bufferedReader.readLine();
            while (line != null) {
                response.contentCollection.add(line);
                temp.append(line).append("\r\n");
                line = bufferedReader.readLine();
            }
            bufferedReader.close();

            // NOTE(review): Content-Encoding (e.g. "gzip") is a transfer
            // encoding, not a charset, yet it is stored as the response's
            // "encoding" with the platform charset as fallback — confirm intent.
            String encoding = urlConnection.getContentEncoding();
            if (encoding == null)
                encoding = this.defaultContentEncoding;

            response.urlString = urlString;

            response.defaultPort = urlConnection.getURL().getDefaultPort();
            response.file = urlConnection.getURL().getFile();
            response.host = urlConnection.getURL().getHost();
            response.path = urlConnection.getURL().getPath();
            response.port = urlConnection.getURL().getPort();
            response.protocol = urlConnection.getURL().getProtocol();
            response.query = urlConnection.getURL().getQuery();
            response.ref = urlConnection.getURL().getRef();
            response.userInfo = urlConnection.getURL().getUserInfo();
            response.contentLength = urlConnection.getContentLength();

            response.content = new String(temp.toString().getBytes());
            response.contentEncoding = encoding;
            response.code = urlConnection.getResponseCode();
            response.message = urlConnection.getResponseMessage();
            response.contentType = urlConnection.getContentType();
            response.method = urlConnection.getRequestMethod();
            response.connectTimeout = urlConnection.getConnectTimeout();
            response.readTimeout = urlConnection.getReadTimeout();

            return response;
        } catch (IOException e) {
            throw e;
        } finally {
            // Always release the connection, even on failure.
            if (urlConnection != null){
                urlConnection.disconnect();
            }
        }
    }

    /**
     * Decompress a gzip byte array.
     *
     * Returns null for null/empty input; on a decompression error it prints
     * the stack trace and returns whatever was decompressed so far.
     */
    public static byte[] gunzip(byte[] bytes) {
        if (bytes == null || bytes.length == 0) {
            return null;
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        ByteArrayInputStream in = new ByteArrayInputStream(bytes);
        try {
            GZIPInputStream ungzip = new GZIPInputStream(in);
            byte[] buffer = new byte[256];
            int n;
            while ((n = ungzip.read(buffer)) >= 0) {
                out.write(buffer, 0, n);
            }
        } catch (IOException e) {
            System.err.println("gzip uncompress error.");
            e.printStackTrace();
        }

        return out.toByteArray();
    }

    /**
     * Get the default response charset.
     */
    public String getDefaultContentEncoding() {
        return this.defaultContentEncoding;
    }

    /**
     * Set the default response charset.
     */
    public void setDefaultContentEncoding(String defaultContentEncoding) {
        this.defaultContentEncoding = defaultContentEncoding;
    }

    public int getConnectTimeout() {
        return connectTimeout;
    }

    public void setConnectTimeout(int connectTimeout) {
        this.connectTimeout = connectTimeout;
    }

    public int getReadTimeout() {
        return readTimeout;
    }

    public void setReadTimeout(int readTimeout) {
        this.readTimeout = readTimeout;
    }
}

HttpResponse.java

package com.kuaidaili.sdk;

import java.util.Vector;

/**
 * HTTP response object.
 *
 * Plain data holder populated by HttpRequest (fields are package-private)
 * and exposed to callers through read-only getters.
 */
public class HttpResponse {

    String urlString;       // original request URL
    int defaultPort;        // default port for the URL's protocol
    String file;            // URL file part (path + query)
    String host;
    String path;
    int port;               // explicit port in the URL, or -1 if absent
    String protocol;
    String query;
    String ref;             // URL fragment
    String userInfo;
    String contentEncoding; // Content-Encoding header, or the fallback charset set by HttpRequest
    int contentLength;
    String content;         // full body, lines joined with CRLF
    String contentType;
    int code;               // HTTP status code
    String message;         // HTTP status message
    String method;          // request method actually used

    int connectTimeout;

    int readTimeout;

    Vector<String> contentCollection; // body split into individual lines

    public String getContent() {
        return content;
    }

    public String getContentType() {
        return contentType;
    }

    public int getCode() {
        return code;
    }

    public String getMessage() {
        return message;
    }

    public Vector<String> getContentCollection() {
        return contentCollection;
    }

    public String getContentEncoding() {
        return contentEncoding;
    }

    public String getMethod() {
        return method;
    }

    public int getConnectTimeout() {
        return connectTimeout;
    }

    public int getReadTimeout() {
        return readTimeout;
    }

    public String getUrlString() {
        return urlString;
    }

    public int getDefaultPort() {
        return defaultPort;
    }

    public String getFile() {
        return file;
    }

    public String getHost() {
        return host;
    }

    public String getPath() {
        return path;
    }

    public int getPort() {
        return port;
    }

    public String getProtocol() {
        return protocol;
    }

    public String getQuery() {
        return query;
    }

    public String getRef() {
        return ref;
    }

    public String getUserInfo() {
        return userInfo;
    }

}

httpclient

HttpClient-4.5.6

使用提示

  1. 此样例同时支持访问http和https网页
  2. 建议使用白名单访问(HttpClient在使用用户名密码认证时会出现一定数量的认证失败)
  3. 运行环境要求 jdk >= 1.6
  4. 依赖包(点击下载):
    httpclient-4.5.6.jar
    httpcore-4.4.10.jar
    commons-codec-1.10.jar
    commons-logging-1.2.jar
package com.kuaidaili.sdk;

import java.net.URL;

import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Request pages through the tunnel proxy server using Apache HttpClient.
 * Works for both http and https pages.
 */
public class TestProxyHttpClient {

    private static String pageUrl1 = "http://dev.kdlapi.com/testproxy"; // target page (http)
    private static String pageUrl2 = "https://dev.kdlapi.com/testproxy"; // target page (https)
    private static String tunnelHost = "mytunnelhost"; // tunnel server host
    private static int tunnelPort = mytunnelport; // tunnel server port: replace the placeholder with your integer port
    private static String username = "myusername"; // tunnel id
    private static String password = "mypassword"; // password

    public static void main(String[] args) throws Exception {
        CredentialsProvider credsProvider = new BasicCredentialsProvider();
        // Fix: the original referenced undeclared proxyIp/proxyPort;
        // the fields are named tunnelHost/tunnelPort.
        credsProvider.setCredentials(
                new AuthScope(tunnelHost, tunnelPort),
                new UsernamePasswordCredentials(username, password));
        CloseableHttpClient httpclient = HttpClients.custom()
                .setDefaultCredentialsProvider(credsProvider).build();
        try {
            // Fix: the original closed httpclient in a finally block after the
            // first request and then reused the closed client for the second
            // one (which also targeted the misspelled "pageUrl12"). Both
            // requests now share one client, closed once at the end.
            for (String pageUrl : new String[] { pageUrl1, pageUrl2 }) {
                URL url = new URL(pageUrl);
                HttpHost target = new HttpHost(url.getHost(), url.getDefaultPort(), url.getProtocol());
                HttpHost proxy = new HttpHost(tunnelHost, tunnelPort);

                RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
                HttpGet httpget = new HttpGet(url.getPath());
                httpget.setConfig(config);
                httpget.addHeader("Accept-Encoding", "gzip"); // gzip-compressed transfer makes the download faster

                System.out.println("Executing request " + httpget.getRequestLine() + " to " + target + " via " + proxy);

                CloseableHttpResponse response = httpclient.execute(target, httpget);
                try {
                    System.out.println("----------------------------------------");
                    System.out.println(response.getStatusLine());
                    System.out.println(EntityUtils.toString(response.getEntity()));
                } finally {
                    response.close();
                }
            }
        } finally {
            httpclient.close();
        }
    }
}