大家好,欢迎来到IT知识分享网。
“捧腹网”页面结构分析
捧腹网M站地址: http://m.pengfu.com/
“捧腹网”网页源码分析
“捧腹网”数据列表请求URL分析
使用Jsoup解析网页
在实际开发中,我们需要通过异步任务来获取、解析网络数据,所以在这里,我通过 HttpURLConnection 来获取网页源码。
1.封装HTTP请求工具类
package com.lnyp.joke.http; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintWriter; import java.net.HttpURLConnection; import java.net.URL; / * Http请求的工具类 * */ public class HttpUtils {
private static final int TIMEOUT_IN_MILLIONS = 10000; public interface CallBack {
void onRequestComplete(String result); } / * 异步的Get请求 * * @param urlStr * @param callBack */ public static void doGetAsyn(final String urlStr, final CallBack callBack) { new Thread() { public void run() { try { String result = doGet(urlStr); if (callBack != null) { callBack.onRequestComplete(result); } } catch (Exception e) { e.printStackTrace(); } } ; }.start(); } / * Get请求,获得返回数据 * * @param urlStr * @return * @throws Exception */ public static String doGet(String urlStr) { URL url = null; HttpURLConnection conn = null; InputStream is = null; ByteArrayOutputStream baos = null; try { url = new URL(urlStr); conn = (HttpURLConnection) url.openConnection(); conn.setReadTimeout(TIMEOUT_IN_MILLIONS); conn.setConnectTimeout(TIMEOUT_IN_MILLIONS); conn.setRequestMethod("GET"); conn.setRequestProperty("accept", "*/*"); conn.setRequestProperty("connection", "Keep-Alive"); conn.setRequestProperty("User-Agent", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"); if (conn.getResponseCode() == 200) { is = conn.getInputStream(); baos = new ByteArrayOutputStream(); int len = -1; byte[] buf = new byte[128]; while ((len = is.read(buf)) != -1) { baos.write(buf, 0, len); } baos.flush(); // System.out.print("str : " + baos.toString()); return baos.toString(); } else { throw new RuntimeException(" responseCode is not 200 ... "); } } catch (Exception e) { e.printStackTrace(); } finally { try { if (is != null) is.close(); } catch (IOException e) { } try { if (baos != null) baos.close(); } catch (IOException e) { } conn.disconnect(); } return null; } }
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
2.查询网页源码,转化为Document对象。
/**
 * Fetches the first joke list page asynchronously and parses the returned
 * HTML into a Jsoup {@code Document}.
 */
private void qryJokes() {
    final String url = "http://m.pengfu.com/index_1.html";
    System.out.println(url);

    HttpUtils.CallBack onPageLoaded = new HttpUtils.CallBack() {
        @Override
        public void onRequestComplete(String result) {
            // Nothing to parse when no page source was delivered.
            if (result != null) {
                Document doc = Jsoup.parse(result);
            }
        }
    };
    HttpUtils.doGetAsyn(url, onPageLoaded);
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
3.通过Jsoup解析网页源码,封装列表数据
import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.util.ArrayList; import java.util.List; / * 笑话工具类 */ public class JokeUtil { public List<JokeBean> getNewJokelist(Document doc) { //class等于list-item的div标签 Elements list_item_elements = doc.select("div.list-item"); List<JokeBean> jokeBeanList = new ArrayList<>(); if (list_item_elements.size() > 0) { for (int i = 0; i < list_item_elements.size(); i++) {
JokeBean jokeBean = new JokeBean(); Element list_item_element = list_item_elements.get(i); Elements head_name_elements = list_item_element.select("div.head-name"); if (head_name_elements.size() > 0) { Element head_name_element = head_name_elements.first(); if (head_name_element != null) { String userAvatar = head_name_element.select("img").first().attr("src"); String userName = head_name_element.select("a[href]").get(1).text(); //带有href属性的a元素 String lastTime = head_name_element.getElementsByClass("dp-i-b").first().text(); //带有href属性的a元素 String shareUrl = head_name_element.select("a[href]").get(1).attr("href"); jokeBean.setUserAvatar(userAvatar); jokeBean.setUserName(userName); jokeBean.setLastTime(lastTime); jokeBean.setShareUrl(shareUrl); } } Element con_img_elements = list_item_element.select("div").get(2); if (con_img_elements != null) { if (con_img_elements.select("img") != null) { Element img_element = con_img_elements.select("img").first(); JokeBean.DataBean dataBean = new JokeBean.DataBean(); if (img_element != null) { String showImg = img_element.attr("src"); String gifsrcImg = img_element.attr("gifsrc"); String width = img_element.attr("width"); String height = img_element.attr("height"); dataBean.setShowImg(showImg); dataBean.setGifsrcImg(gifsrcImg); dataBean.setWidth(width); dataBean.setHeight(height); } else { String content = con_img_elements.text().replaceAll(" ", "\n"); dataBean.setContent(content); } jokeBean.setDataBean(dataBean); } } Element tagwrap_clearfix_elements = list_item_element.select("div").get(3); if (tagwrap_clearfix_elements != null) { Elements clearfixs = tagwrap_clearfix_elements.select("a[href]"); //带有href属性的a元素 List<String> tags = new ArrayList<>(); for (int j = 0; j < clearfixs.size(); j++) {
String tag = clearfixs.get(j) != null ? clearfixs.get(j).text() : ""; tags.add(tag); } jokeBean.setTags(tags); } jokeBeanList.add(jokeBean); } } return jokeBeanList; } }
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
如果你迫不及待的想看源码,请前往https://github.com/zuiwuyuan/Joke查看。谢谢大家的支持。
版权声明:本文为博主原创文章,未经博主允许不得转载。 http://blog.csdn.net/zuiwuyuan/article/details/
免责声明:本站所有文章内容,图片,视频等均是来源于用户投稿和互联网及文摘转载整编而成,不代表本站观点,不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益,请在线联系站长,一经查实,本站将立刻删除。 本文来自网络,若有侵权,请联系删除,如若转载,请注明出处:https://haidsoft.com/102062.html