From b68a6dd5b89bcb49dfd16c04c733d54c12043eeb Mon Sep 17 00:00:00 2001 From: lihaitao Date: Sat, 10 Sep 2022 00:10:10 +0800 Subject: [PATCH] init project --- pom.xml | 88 +++ src/main/java/META-INF/MANIFEST.MF | 3 + .../java/com/hitoli/fetchPic/DownLoad.java | 166 +++++ .../java/com/hitoli/fetchPic/FindLink.java | 255 +++++++ .../java/com/hitoli/fetchPic/HttpUtils.java | 115 ++++ src/main/java/com/hitoli/fetchPic/Main.java | 417 ++++++++++++ .../java/com/hitoli/fetchPic/PictInfo.java | 47 ++ src/main/java/com/hitoli/fetchPic/Utils.java | 622 ++++++++++++++++++ 8 files changed, 1713 insertions(+) create mode 100644 pom.xml create mode 100644 src/main/java/META-INF/MANIFEST.MF create mode 100644 src/main/java/com/hitoli/fetchPic/DownLoad.java create mode 100644 src/main/java/com/hitoli/fetchPic/FindLink.java create mode 100644 src/main/java/com/hitoli/fetchPic/HttpUtils.java create mode 100644 src/main/java/com/hitoli/fetchPic/Main.java create mode 100644 src/main/java/com/hitoli/fetchPic/PictInfo.java create mode 100644 src/main/java/com/hitoli/fetchPic/Utils.java diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..1eb4e2a --- /dev/null +++ b/pom.xml @@ -0,0 +1,88 @@ + + + 4.0.0 + + com.hitoli.fetchPic + fetchPic + 1.0-SNAPSHOT + + + UTF-8 + UTF-8 + 1.8 + + 3.7.0 + + + + + org.apache.httpcomponents + httpmime + 4.5.3 + + + commons-codec + commons-codec + 1.9 + + + commons-logging + commons-logging + 1.2 + + + org.apache.httpcomponents + fluent-hc + 4.5.3 + + + org.apache.httpcomponents + httpclient-cache + 4.5.3 + + + org.apache.httpcomponents + httpclient-win + 4.5.3 + + + org.apache.httpcomponents + httpcore + 4.4.6 + + + org.jsoup + jsoup + 1.11.3 + + + net.sourceforge.htmlunit + htmlunit + 2.32 + + + org.apache.commons + commons-lang3 + 3.7 + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven-compiler.version} + + ${java.version} + ${java.version} + + + + + + + \ No newline at end of file diff --git a/src/main/java/META-INF/MANIFEST.MF b/src/main/java/META-INF/MANIFEST.MF new file mode 100644 index 0000000..cf9d16a --- /dev/null +++ b/src/main/java/META-INF/MANIFEST.MF @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +Main-Class: com.hitoli.fetchPic.Main + diff --git a/src/main/java/com/hitoli/fetchPic/DownLoad.java b/src/main/java/com/hitoli/fetchPic/DownLoad.java new file mode 100644 index 0000000..3e9dded --- /dev/null +++ b/src/main/java/com/hitoli/fetchPic/DownLoad.java @@ -0,0 +1,166 @@ +package com.hitoli.fetchPic; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.SocketTimeoutException; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; +import org.apache.http.HttpEntity; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; + +public class DownLoad { + + public static RequestConfig defaultRequestConfig = RequestConfig.custom() + .setSocketTimeout(5000) + .setConnectTimeout(5000) + .setConnectionRequestTimeout(5000) + .setStaleConnectionCheckEnabled(true) + .setRedirectsEnabled(true) + .setMaxRedirects(3) + .build(); + + public static CloseableHttpClient httpClient = HttpClients.custom(). + setDefaultRequestConfig(defaultRequestConfig).build(); + + public static void downloadPict(PictInfo pictInfo) { + + String url = pictInfo.getUrl(); + CloseableHttpResponse response = null; + OutputStream out = null; + InputStream in=null; + BufferedReader br=null; + byte buffer[] = new byte[1024]; + if(StringUtils.isNotEmpty(url)){ + try { + String suffix = url.substring(url.lastIndexOf(".")); + String temp = suffix.substring(1, suffix.length()).toUpperCase(); + + if (!(temp.equals("BMP") || temp.equals("JPG") || temp.equals("JPEG") || temp.equals("GIF") || + temp.equals("PNG") || temp.equals("WEBP"))) { //非图片的丢弃 + return; + } + String name = url.substring(url.lastIndexOf("/")+1, url.lastIndexOf(".")); + if (StringUtils.isEmpty(name)) { + name = String.valueOf(System.currentTimeMillis()); + } else { + name = Utils.specialSymbolRemoval(name + "_" + System.currentTimeMillis()); + } + HttpGet httpGet = new HttpGet(url); + httpGet.setConfig(defaultRequestConfig); + if (StringUtils.isNotEmpty(Main.refererUrl)) { + if (Main.refererUrl.equalsIgnoreCase("self")) { + httpGet.setHeader("referer", pictInfo.getHtmlUrl()); + } else { + httpGet.setHeader("referer", Main.refererUrl); + } + } + response = httpClient.execute(httpGet); + HttpEntity entity = response.getEntity(); + long imgSize = entity.getContentLength(); + if (imgSize < Main.imgMinSize*1024) { //默认图片小于1KB的丢弃 + throw new Exception("图片只有" + (imgSize/1024) + "KB,小于" + Main.imgMinSize + "KB"); + } + in = entity.getContent(); + String title = pictInfo.getTitle(); + if (StringUtils.isEmpty(title)) { + title = "other"; + } else { + title = Utils.specialSymbolRemoval(title); + } + File file = Utils.createImgFile(Utils.getUseFileNameShortSite(pictInfo.getSite()), title, name, suffix); + Utils.info("正在下载:" + url); + out = new FileOutputStream(file); + int index = 0; + while((index = in.read(buffer)) != -1){ + out.write(buffer,0,index); + } + out.flush(); + + Main.imgDownloaded.add(url); + } catch (Exception e) { + Utils.error("下载失败:" + url + " [" + e.getMessage() + "]"); + if (e instanceof SocketTimeoutException) { + Main.readTimeOutImgs.add(pictInfo); + } + } finally { + try { + if (br != null){ + br.close(); + } + if (out != null){ + out.close(); + } + if (in != null){ + in.close(); + } + if (response != null) { + response.close(); + } + } catch (IOException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } + } + } + + } + + public static void downloadPict(List pictLinks){ + + if (null != pictLinks && !pictLinks.isEmpty()) { + if (Main.thread) { + for (int i=0; i pageLinks, List lazyPageLinks, int connectExceptionRetryCount) { + + boolean result = true; + if (!Main.pageSet.contains(url)) { //已分析过的连接不再分析 + Utils.info("开始分析url[" + url + "]中的可用连接"); + try { + Document document = HttpUtils.getInstance().getHtmlPageResponseAsDocument(url); + String site = Utils.getSite(url); + Elements elements = document.select("a"); + boolean findNext = false;//是否找到下一页 + Set allUrls = new HashSet();//当前访问url下的所有链接地址 + for (Element element : elements) { + String href = element.attr("href"); + if (!"".equals(href) && !"#".equals(href) && href.indexOf(".js") == -1 && href.indexOf(".css") == -1 + && href.indexOf("javascript") == -1) { + href = Utils.getFullPath(site, url, href); + if (StringUtils.isNotEmpty(Main.fixedUrlPrefix)) { + if (!href.startsWith(Main.fixedUrlPrefix)) { + continue; + } + } + if (href.equals(url) || Main.pageSet.contains(href)) { + continue; + } + if (!Utils.checkSite(href)) { + continue; + } + Elements imgs = element.select("img"); + boolean discard = false; + for (Element img : imgs) { + //A标签的href地址和图片地址一致,说明是图片地址,不用再对该地址进行分析 + if (Utils.getFullPath(site, url, img.attr("src")).equals(href)) { + discard = true; + break; + } + } + if (discard) { + continue; + } + allUrls.add(href); + String text = element.text().trim(); + String title = element.attr("title"); + if (Utils.isNextPageButton(text) || Utils.isNextPageButton(title)) { //始终优先处理下一页的内容 + pageLinks.add(1, href); + findNext = true; + } else { + //通过父元素的class判断url是否是头布局中的链接(头布局中的链接延迟扫描) + boolean head = false; + Elements parents = element.parents(); + for (Element parent : parents) { + for (String className : parent.classNames()) { + if (className.equalsIgnoreCase("head") || + className.equalsIgnoreCase("header") || + className.equalsIgnoreCase("logo")) { + head = true; + break; + } + if (head) { + break; + } + } + } + if (head || imgs.isEmpty() || Utils.isSite(href)) { //非图片链接延迟扫描 + if (!lazyPageLinks.contains(href)) { + lazyPageLinks.add(href); + } + } else { + if (!pageLinks.contains(href)) { + pageLinks.add(href); + } + } + } + } + } + /** + * 通过比对url/到点之间的数字,大于当前页数字,并且小于所有获取的数字则是下一页 + * ( + * 如xxx/xxx_0.html,xxx/xxx_1.html,xxx/xxx_2.html, + * 当前页是xxx/xxx_0.html + * 获取xxx_0,xxx_1,xxx_2 + * 获取数字0,1,2 + * 比0大的数中最小的为下一页 + * ) + */ + if (!findNext && !allUrls.isEmpty()) { + String _urlStart = url.substring(0, url.lastIndexOf("/")+1); + String _urlEnd = url.substring(url.lastIndexOf("/")+1, url.lastIndexOf(".")); + String urlNumberStr = ""; + for(int i=0; i<_urlEnd.length(); i++){ + if(_urlEnd.charAt(i) >= 48 && _urlEnd.charAt(i) <= 57) { + urlNumberStr += _urlEnd.charAt(i); + } + } + Long urlNumber = Long.valueOf(urlNumberStr); + Long nextPageNumber = 0l; + String nextPageUrl = ""; + for (String s : allUrls) { + if (!url.equalsIgnoreCase(s) && s.indexOf(_urlStart) != -1) { + String _sEnd = s.substring(s.lastIndexOf("/")+1, s.lastIndexOf(".")); + String _sNumberStr = ""; + for(int i=0; i<_sEnd.length(); i++) { + if (_sEnd.charAt(i) >= 48 && _sEnd.charAt(i) <= 57) { + _sNumberStr += _sEnd.charAt(i); + } + } + Long _sNumber = Long.valueOf(_sNumberStr); + if (nextPageUrl == "") { + if (_sNumber.intValue() > urlNumber) { + nextPageNumber = _sNumber; + nextPageUrl = s; + } + } else { + if (_sNumber < nextPageNumber) { + nextPageNumber = _sNumber; + nextPageUrl = s; + } + } + } + } + if (nextPageUrl != "") { + pageLinks.add(1, nextPageUrl); + findNext = true; + } + } + } catch (Exception e) { + if (connectExceptionRetryCount <= 0) { + Utils.error("无效地址:" + url); + result = false; + } else { + Utils.error("重试访问地址:" + url + "第" + (5 - connectExceptionRetryCount + 1) + "次"); + connectExceptionRetryCount--; + return addPageLink(url, pageLinks, lazyPageLinks, connectExceptionRetryCount); + } + } + Utils.info("分析url[" + url + "]中的可用连接结束"); + Main.pageSet.add(url); + } + + return result; + + } + + /** + * 找出url下的所有图片连接 + * @param url + * @param pictInfos + * @param nextPageUrl + * @param connectExceptionRetryCount 重试次数 + */ + public static boolean addPictLink(String url, List pictInfos, String nextPageUrl, int connectExceptionRetryCount) { + + boolean result = true; + if (null == nextPageUrl) { + nextPageUrl = ""; + } + Utils.info("开始分析url[" + url + "]中的可用图片连接"); + try { + Document document = HttpUtils.getInstance().getHtmlPageResponseAsDocument(url); + String site = Utils.getSite(url); + Elements elements = document.select("img"); + Element head = document.head(); + Elements titles = head.getElementsByTag("title"); + String title = ""; + if (null != titles && !titles.isEmpty()) { + title = titles.get(0).text(); + if (StringUtils.isNotEmpty(title)) { + if (null != Main.pageTitlefilters && !Main.pageTitlefilters.isEmpty()) { + for (String filter : Main.pageTitlefilters) { + title = title.replaceAll(filter, ""); + } + } + } + } + for (Element element : elements) { + String imgAttrName = "src"; + if (null != Main.imgSrcRepletTags && !Main.imgSrcRepletTags.isEmpty()) { + for (String tag : Main.imgSrcRepletTags) { + if (element.hasAttr(tag)) { + imgAttrName = tag; + break; + } + } + } + String src = element.attr(imgAttrName); + if (StringUtils.isNotEmpty(src) && src.toUpperCase().indexOf("JAVASCRIPT") == -1) { + if (!Main.imgNamefilters.isEmpty()) { //检查是否存在与要丢弃的图片名称中 + boolean discard = false; + try { + String temp = src.substring(src.lastIndexOf("/") + 1, + src.lastIndexOf(".")).toUpperCase(); + for (String imgNamefilter : Main.imgNamefilters) { + if (temp.equals(imgNamefilter)) { + discard = true; + break; + } + } + } catch (Exception e) { + } + if (discard) { + continue; + } + } + src = Utils.getFullPath(site, url, src); + Element parent = element.parent(); + if (parent.tagName().toUpperCase().equals("A")) { + //一般A标签下的img都是预览图片,但有些网站的a标签下的图片和a标签href地址一致,不是预览图片 + //有的图片a标签的href是下一页地址 + String parentUrl = parent.attr("href"); + String href = Utils.getFullPath(site, url, parentUrl); + if (StringUtils.isNotEmpty(parentUrl) && !href.equals(src) && !nextPageUrl.equals(href)) { + continue; + } + } + if (!Main.imgDownloaded.contains(src)) { + pictInfos.add(new PictInfo(site, title, src, url)); + } + } + } + } catch (Exception e) { + if (connectExceptionRetryCount <= 0) { + Utils.error("无效地址:" + url); + result = false; + } else { + Utils.error("重试访问地址:" + url + "第" + (5 - connectExceptionRetryCount + 1) + "次"); + connectExceptionRetryCount--; + addPictLink(url, pictInfos, nextPageUrl, connectExceptionRetryCount); + } + } + Utils.info("分析url[" + url + "]中的可用图片连接结束"); + + return result; + } + +} diff --git a/src/main/java/com/hitoli/fetchPic/HttpUtils.java b/src/main/java/com/hitoli/fetchPic/HttpUtils.java new file mode 100644 index 0000000..ba460eb --- /dev/null +++ b/src/main/java/com/hitoli/fetchPic/HttpUtils.java @@ -0,0 +1,115 @@ +package com.hitoli.fetchPic; + +import com.gargoylesoftware.htmlunit.BrowserVersion; +import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; +import com.gargoylesoftware.htmlunit.WebClient; +import com.gargoylesoftware.htmlunit.html.HtmlPage; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +/** + *
+ * Http工具,包含:
+ * 高级http工具(使用net.sourceforge.htmlunit获取完整的html页面,即完成后台js代码的运行)
+ * 
+ */ +public class HttpUtils { + + private WebClient webClient; + + /** + * 等待异步JS执行时间 + */ + private int waitForBackgroundJavaScript; + + private static HttpUtils httpUtils; + + private HttpUtils() { + } + + /** + * 获取实例 + * + * @return + */ + public static HttpUtils getInstance() { + if (httpUtils == null) { + httpUtils = new HttpUtils(); + } + return httpUtils; + } + + /** + * + * @param browserTimeout 浏览器请求超时时间 + * @param jsTimeout js请求超时时间 + * @param jsEnabled 是否启用js + * @param waitForBackgroundJavaScript 等待异步JS执行时间 + */ + public void initWebClient(int browserTimeout, int jsTimeout, boolean jsEnabled, int waitForBackgroundJavaScript) { + this.waitForBackgroundJavaScript = waitForBackgroundJavaScript; + + webClient = new WebClient(BrowserVersion.CHROME); + + webClient.getOptions().setThrowExceptionOnScriptError(false);//当JS执行出错的时候是否抛出异常 + webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);//当HTTP的状态非200时是否抛出异常 + webClient.getOptions().setActiveXNative(false); + webClient.getOptions().setCssEnabled(false);//是否启用CSS + + if (jsEnabled) { + webClient.getOptions().setJavaScriptEnabled(true); //很重要,启用JS + webClient.setAjaxController(new NicelyResynchronizingAjaxController());//很重要,设置支持AJAX + } else { + webClient.getOptions().setJavaScriptEnabled(false); + } + + webClient.getOptions().setTimeout(browserTimeout);//设置“浏览器”的请求超时时间 + webClient.setJavaScriptTimeout(jsTimeout);//设置JS执行的超时时间 + } + + /** + * 将网页返回为解析后的文档格式 + * + * @param html + * @return + * @throws Exception + */ + public static Document parseHtmlToDoc(String html) throws Exception { + return removeHtmlSpace(html); + } + + private static Document removeHtmlSpace(String str) { + Document doc = Jsoup.parse(str); + String result = doc.html().replace(" ", ""); + return Jsoup.parse(result); + } + + /** + * 获取页面文档字串(等待异步JS执行) + * + * @param url 页面URL + * @return + * @throws Exception + */ + public String getHtmlPageResponse(String url) throws Exception { + HtmlPage page; + try { + page = webClient.getPage(url); + } catch (Exception e) { + throw e; + } + webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);//该方法阻塞线程 + return page.asXml(); + } + + /** + * 获取页面文档Document对象(等待异步JS执行) + * + * @param url 页面URL + * @return + * @throws Exception + */ + public Document getHtmlPageResponseAsDocument(String url) throws Exception { + return parseHtmlToDoc(getHtmlPageResponse(url)); + } +} diff --git a/src/main/java/com/hitoli/fetchPic/Main.java b/src/main/java/com/hitoli/fetchPic/Main.java new file mode 100644 index 0000000..d9415fd --- /dev/null +++ b/src/main/java/com/hitoli/fetchPic/Main.java @@ -0,0 +1,417 @@ +package com.hitoli.fetchPic; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + + +public class Main { + + public static Set sites = new HashSet(); //所有网站根(防止访问地址越界) + public static Set transboundarySites = new HashSet(); //可越界的网站根 + public static Set pageSet = new HashSet(); //所有已爬过的地址 + public static Set readTimeOutImgs = new HashSet(); //所有读取超时的图片 + public static Set imgDownloaded = new HashSet(); //已经下载的图片地址 + public static Set imgDownloadedDirName = new HashSet();//已经下载的图片目录名称 + public static String home = null; + public static String downLoadDir = null; //下载目录 + public static String siteData = null; //网站可以扫描的域名地址记录文件(防止越界) + public static String downLoadedImgData = null; //已经下载的图片地址记录文件 + public static String downloadedImgDirNameData = null; //已经下载的图片目录名称记录文件 + public static String allowDifferentCharacters = "0";//比对下载图片的title和已经下载的图片目录名称记录允许相差的字符数(用于检查图片存放路径的目录是否已经存在) + public static String readTimeOutImgData = null; //所有读取超时的图片地址记录文件 + public static String allUrlData = null; //所有已爬过的地址记录文件 + public static String currentUrlData = null; //当前正在处理的地址记录文件 + public static String lazyUrlData = null; //延迟处理的地址记录文件 + public static String commandFile = null;//命令文件(每次循环pageLinks检查一次) + public static long imgMinSize = 1; //下载图片最小单位(KB) + public static List nextPageNames = Arrays.stream( + new String[]{"下一页", "下一篇", "下一章", "后", "NEXT", ">", ">>", ">>>"} + ).collect(Collectors.toList()); //下一页按钮中的text + public static List pageTitlefilters = Arrays.stream( + new String[]{"(\\第\\d+\\页)"} + ).collect(Collectors.toList()); //title中需要过滤的关键字,过滤全部关键字 + public static List imgNamefilters = Arrays.stream( + new String[]{"LOGO", "FAVICON"} + ).collect(Collectors.toList()); //下载图片中需要丢弃的图片名称 + public static List imgSrcRepletTags = Arrays.stream( + new String[]{"original"} + ).collect(Collectors.toList()); //下载图片的链接非src标签,如果存在此list中的标签名,则获取对应标签的值为下载链接 + public static String refererUrl = "self"; //下载图片时需要模拟的来源url(如有的情况下),默认为self(发现图片的html地址) + public static String fixedUrlPrefix = ""; //只扫描固定前缀的url + public static int browserTimeout = 5000;//请求超时时间,默认5秒 + public static int jsTimeout = 1000;//请求超时时间,默认1秒 + public static int waitForBackgroundJavaScript = 1;//等待异步JS执行时间,默认1秒 + public static boolean jsEnabled = true;//是否启用js + public static boolean thread = false; //是否多线程下载 + public static int threadSize = 5; //一次开启的线程数 + public static int threadSleep = 1; //开满线程数后等待多少秒 + public static boolean stop = false; //是否退出 + public static boolean autoSaveMemoryData = true; //自动保存内存数据到文件 + public static int autoSaveMemoryDataInterval = 3; //保存间隔(分钟) + public static Long lastAutoSaveMemoryDataTime = null; //最后一次保存时间 + + public static void main(String[] args) { + + HttpUtils.getInstance().initWebClient(browserTimeout, jsTimeout, jsEnabled, waitForBackgroundJavaScript); + List pageLinks = new ArrayList(); + List lazyPageLinks = new ArrayList<>(); + List pictInfos = new ArrayList(); + init(args, pageLinks, lazyPageLinks, pictInfos); + + int networkExceptionRetryCount = 5;//连续访问url失败次数 + Set failUrls = new HashSet(); + while(pageLinks.size() > 0){ + int connectExceptionRetryCount = 5;//重复访问同一url失败次数 + + String url = pageLinks.get(0); + if (StringUtils.isEmpty(url)) { + pageLinks.remove(0); + continue; + } + if (StringUtils.isNotEmpty(fixedUrlPrefix)) { + if (!url.startsWith(fixedUrlPrefix)) { + pageLinks.remove(0); + continue; + } + } + + //分析页面所有连接 + boolean result = FindLink.addPageLink(url, pageLinks, lazyPageLinks, connectExceptionRetryCount); + //分析页面所有图片 + if (result) { + networkExceptionRetryCount = 5;//访问成功,恢复次数 + String nextPageUrl = ""; + if (pageLinks.size() >= 2) { + nextPageUrl = pageLinks.get(1); + } + result = FindLink.addPictLink(url, pictInfos, nextPageUrl, connectExceptionRetryCount); + if (!result) { + failUrls.add(url); + networkExceptionRetryCount--; + } else { + networkExceptionRetryCount = 5;//访问成功,恢复次数 + } + } else { + failUrls.add(url); + networkExceptionRetryCount--; + } + if (networkExceptionRetryCount <= 0) { + Utils.error("网络可能出现问题,连续访问5次不同url失败"); + pageLinks.remove(0); + pageLinks.addAll(0, failUrls); + pageSet.removeAll(failUrls); + writeMemoryDataToFile(pageLinks, lazyPageLinks); + writeDefaultCommand(); + System.exit(0); + } + if (pictInfos.size() > 0) { + Utils.info("url[" + url + "]中找到" + pictInfos.size() + "个可用图片连接"); + DownLoad.downloadPict(pictInfos); + } else { + Utils.info("url[" + url + "]中找到0可用图片连接"); + } + pageLinks.remove(0); + + //图片链接扫描完后取非图片链接继续扫描 + getLazyUrl(pageLinks, lazyPageLinks); + + //检查命令文件,如果stop等于true或者pagelinks为空则停止程序 + checkAndExecutCommand(pageLinks, lazyPageLinks); + + //检查是否自动保存内存数据 + checkAutoSaveMemoryData(pageLinks, lazyPageLinks); + } + + } + + private static void init(String[] prams, List pageLinks, List lazyPageLinks, List pictInfos) { + + if (thread) { + sites = Collections.synchronizedSet(new HashSet()); + pageSet = Collections.synchronizedSet(new HashSet()); + imgDownloaded = Collections.synchronizedSet(new HashSet()); + readTimeOutImgs = Collections.synchronizedSet(new HashSet()); + pageLinks = Collections.synchronizedList(new ArrayList()); + lazyPageLinks = Collections.synchronizedList(new ArrayList()); + pictInfos = Collections.synchronizedList(new ArrayList()); + } + + //图片保存路径 + home = ""; + if (null != prams && prams.length > 0) { + home = prams[0]; + } + System.out.println("HOME路径为:" + home); + if (!Utils.checkFilePathExists(home)) { + System.out.println("HOME路径不存在,请手动创建"); + System.exit(1); + } + + downLoadDir = home.endsWith(File.separator) ? (home + "pict") : (home + File.separator + "pict"); + //创建文件下载目录 + Utils.createDir(downLoadDir); + siteData = downLoadDir + File.separator + "siteData"; + downLoadedImgData = downLoadDir + File.separator + "downLoadedImgData"; + downloadedImgDirNameData = downLoadDir + File.separator + "downloadedImgDirNameData"; + readTimeOutImgData = downLoadDir + File.separator + "readTimeOutImgData"; + allUrlData = downLoadDir + File.separator + "allUrlData"; + currentUrlData = downLoadDir + File.separator + "currentUrlData"; + lazyUrlData = downLoadDir + File.separator + "lazyUrlData"; + commandFile = downLoadDir + File.separator + "command"; + //创建网站域名地址记录文件(不存在才创建) + Utils.createFile(siteData); + //创建下载记录文件(不存在才创建) + Utils.createFile(downLoadedImgData); + //创建已经下载的图片目录名称文件(不存在才创建) + Utils.createFile(downloadedImgDirNameData); + //创建读取超时记录文件(不存在才创建) + Utils.createFile(readTimeOutImgData); + //创建所有已爬过的地址记录文件(不存在才创建) + Utils.createFile(allUrlData); + //创建当前需要处理的地址记录文件(不存在才创建) + Utils.createFile(currentUrlData); + //创建延迟处理的地址记录文件(不存在才创建) + Utils.createFile(lazyUrlData); + //创建命令文件(不存在才创建) + Utils.createFile(commandFile); + //写入初始命令 + writeDefaultCommand(); + //读取网站域名地址记录到内存中 + Utils.readFileDataToCollection(siteData, sites); + //读取已经下载的文件记录到内存中 + Utils.readFileDataToCollection(downLoadedImgData, imgDownloaded); + //读取已经下载的图片目录名称到内存中 + Utils.readFileDataToCollection(downloadedImgDirNameData, imgDownloadedDirName); + //读取超时文件记录到内存中 + Utils.readFilePictInfoDataToCollection(readTimeOutImgData, readTimeOutImgs); + //读取所有已爬过的地址记录到内存中 + Utils.readFileDataToCollection(allUrlData, pageSet); + + //读取当前需要处理的连接地址记录到内存中 + Utils.readFileDataToCollection(currentUrlData, pageLinks); + //读取延迟处理的连接地址记录到内存中 + Utils.readFileDataToCollection(lazyUrlData, lazyPageLinks); + //图片链接扫描完后取非图片链接继续扫描 + getLazyUrl(pageLinks, lazyPageLinks); + if (pageLinks.isEmpty()) { + if (null == prams || prams.length < 2) { + Utils.error("请输入抓取地址"); + System.exit(1); + } else { + pageLinks.add(prams[1]); + sites.add(Utils.getSite(prams[1])); + Utils.writeStringToFile(siteData, "", false); + Utils.writeCollectionToFile(siteData, sites); + } + } + + if (null != prams && prams.length >= 3) { + Integer _imgMinSize = null; + try { + _imgMinSize = Integer.valueOf(prams[2]); + } catch (Exception e) { + Utils.error("抓取最小图片大小输入有误,必须大于等于0"); + } + if (_imgMinSize == null || _imgMinSize < 0) { + Utils.error("抓取最小图片大小输入有误,必须大于等于0"); + } else { + imgMinSize = _imgMinSize; + Utils.error("抓取最小图片大小为" + imgMinSize + "KB"); + } + } + + if (autoSaveMemoryData) { //首次启动把启动时间作为最后一次自动保存时间 + lastAutoSaveMemoryDataTime = System.currentTimeMillis(); + } + + if (!readTimeOutImgs.isEmpty()) { + Utils.info("重新下载上次超时的图片"); + List _pictInfos = readTimeOutImgs.stream().map(p -> { + return new PictInfo(p.getSite(), p.getTitle(), p.getUrl(), p.getHtmlUrl()); + }).collect(Collectors.toList()); + readTimeOutImgs.clear(); + DownLoad.downloadPict(_pictInfos); + } + + } + + private static void getLazyUrl(List pageLinks, List lazyPageLinks) { + //图片链接扫描完后取非图片链接继续扫描 + if (pageLinks.isEmpty() && !lazyPageLinks.isEmpty()) { + int index = 0; + String lazyUrl = lazyPageLinks.get(index); + if (Utils.isSite(lazyUrl)) { + if (lazyPageLinks.size() > 1) { + index = 1; + lazyUrl = lazyPageLinks.get(index); + } + } + pageLinks.add(lazyUrl); + lazyPageLinks.remove(index); + } + } + + private static void checkAndExecutCommand(List pageLinks, List lazyPageLinks) { + + readCommandToMemory("stop"); + + if (stop || pageLinks.isEmpty()) { + if (pageLinks.isEmpty()) { + Utils.info("无法找到新的url,抓取图片结束"); + } + writeMemoryDataToFile(pageLinks, lazyPageLinks); + writeDefaultCommand(); + System.exit(0); + } + + } + + private static void checkAutoSaveMemoryData(List pageLinks, List lazyPageLinks) { + + if (autoSaveMemoryData) { + if (null == lastAutoSaveMemoryDataTime) { + writeMemoryDataToFile(pageLinks, lazyPageLinks); + writeDefaultCommand(); + lastAutoSaveMemoryDataTime = System.currentTimeMillis(); + } else if (System.currentTimeMillis() > (lastAutoSaveMemoryDataTime + (autoSaveMemoryDataInterval*60*1000))) { + writeMemoryDataToFile(pageLinks, lazyPageLinks); + writeDefaultCommand(); + lastAutoSaveMemoryDataTime = System.currentTimeMillis(); + } + } + + } + + private static void writeDefaultCommand() { + + readCommandToMemory(); + + Utils.writeStringToFile(commandFile, "", false); + Utils.writeStringToFile(commandFile, "stop=false"); + Utils.writeStringToFile(commandFile, "imgMinSize=" + imgMinSize); + Utils.writeStringToFile(commandFile, "nextPageNames=" + StringUtils.join(nextPageNames, ",")); + Utils.writeStringToFile(commandFile, "pageTitlefilters=" + StringUtils.join(pageTitlefilters, ",")); + Utils.writeStringToFile(commandFile, "imgNamefilters=" + StringUtils.join(imgNamefilters, ",")); + Utils.writeStringToFile(commandFile, "imgSrcRepletTags=" + StringUtils.join(imgSrcRepletTags, ",")); + Utils.writeStringToFile(commandFile, "allowDifferentCharacters=" + allowDifferentCharacters); + Utils.writeStringToFile(commandFile, "browserTimeout=" + browserTimeout); + Utils.writeStringToFile(commandFile, "jsTimeout=" + jsTimeout); + Utils.writeStringToFile(commandFile, "jsEnabled=" + jsEnabled); + Utils.writeStringToFile(commandFile, "waitForBackgroundJavaScript=" + waitForBackgroundJavaScript); + Utils.writeStringToFile(commandFile, "refererUrl=" + refererUrl); + Utils.writeStringToFile(commandFile, "transboundarySites=" + StringUtils.join(transboundarySites, ",")); + Utils.writeStringToFile(commandFile, "fixedUrlPrefix=" + fixedUrlPrefix); + Utils.writeStringToFile(commandFile, "thread=" + thread); + Utils.writeStringToFile(commandFile, "threadSize=" + threadSize); + Utils.writeStringToFile(commandFile, "threadSleep=" + threadSleep); + Utils.writeStringToFile(commandFile, "autoSaveMemoryData=" + autoSaveMemoryData); + Utils.writeStringToFile(commandFile, "autoSaveMemoryDataInterval=" + autoSaveMemoryDataInterval); + + } + + private static void readCommandToMemory() { + readCommandToMemory(Collections.EMPTY_LIST); + } + + private static void readCommandToMemory(String key) { + readCommandToMemory(Arrays.asList(new String[] {key})); + } + + private static void readCommandToMemory(List keys) { + + Set command = new HashSet(); //命令列表 + Utils.readFileDataToCollection(commandFile, command); + for (String c : command) { + String[] _c = c.split("="); + if (_c.length != 2) { + continue; + } + if (null != keys && !keys.isEmpty()) { + boolean exists = false; + for (String key : keys) { + if (_c[0].equals(key)) { + exists = true; + break; + } + } + if (!exists) { + continue; + } + } + try { + if (_c[0].equals("stop")) { + stop = Boolean.valueOf(_c[1]); + } else if (_c[0].equals("imgMinSize")) { + imgMinSize = Long.valueOf(_c[1]); + } else if (_c[0].equals("nextPageNames")) { + nextPageNames = Arrays.stream(_c[1].split(",")).collect(Collectors.toList()); + } else if (_c[0].equals("pageTitlefilters")) { + pageTitlefilters = Arrays.stream(_c[1].split(",")).collect(Collectors.toList()); + } else if (_c[0].equals("imgNamefilters")) { + imgNamefilters = Arrays.stream(_c[1].split(",")).collect(Collectors.toList()); + } else if (_c[0].equals("imgSrcRepletTags")) { + imgSrcRepletTags = Arrays.stream(_c[1].split(",")).collect(Collectors.toList()); + } else if (_c[0].equals("allowDifferentCharacters")) { + allowDifferentCharacters = String.valueOf(_c[1]); + } else if (_c[0].equals("browserTimeout")) { + browserTimeout = Integer.valueOf(_c[1]); + } else if (_c[0].equals("jsTimeout")) { + jsTimeout = Integer.valueOf(_c[1]); + } else if (_c[0].equals("jsEnabled")) { + jsEnabled = Boolean.valueOf(_c[1]); + } else if (_c[0].equals("waitForBackgroundJavaScript")) { + waitForBackgroundJavaScript = Integer.valueOf(_c[1]); + } else if (_c[0].equals("refererUrl")) { + refererUrl = _c[1]; + } else if (_c[0].equals("transboundarySites")) { + transboundarySites = Arrays.stream(_c[1].split(",")).collect(Collectors.toSet()); + } else if (_c[0].equals("fixedUrlPrefix")) { + fixedUrlPrefix = _c[1]; + } else if (_c[0].equals("thread")) { + thread = Boolean.valueOf(_c[1]); + } else if (_c[0].equals("threadSize")) { + threadSize = Integer.valueOf(_c[1]); + } else if (_c[0].equals("threadSleep")) { + threadSleep = Integer.valueOf(_c[1]); + } else if (_c[0].equals("autoSaveMemoryData")) { + autoSaveMemoryData = Boolean.valueOf(_c[1]); + } else if (_c[0].equals("autoSaveMemoryDataInterval")) { + autoSaveMemoryDataInterval = Integer.valueOf(_c[1]); + } + } catch (Exception e) { + e.printStackTrace(); + Utils.error("参数错误" + e.getMessage()); + } + } + + } + + private static void writeMemoryDataToFile(List pageLinks, List lazyPageLinks) { + Utils.info("正在保存重要的分析数据,请不要强制退出"); + //先用一条空数据把记录覆盖,再用新数据写入到记录文件中 + Utils.writeStringToFile(siteData, "", false); + Utils.writeCollectionToFile(siteData, sites); + Utils.writeStringToFile(allUrlData, "", false); + Utils.writeCollectionToFile(allUrlData, pageSet); + Utils.writeStringToFile(currentUrlData, "", false); + Utils.writeCollectionToFile(currentUrlData, pageLinks); + Utils.writeStringToFile(lazyUrlData, "", false); + Utils.writeCollectionToFile(lazyUrlData, lazyPageLinks); + Utils.writeStringToFile(downLoadedImgData, "", false); + Utils.writeCollectionToFile(downLoadedImgData, imgDownloaded); + Utils.writeStringToFile(downloadedImgDirNameData, "", false); + Utils.writeCollectionToFile(downloadedImgDirNameData, imgDownloadedDirName); + Utils.writeStringToFile(readTimeOutImgData, "", false); + Utils.writePictInfoCollectionToFile(readTimeOutImgData, readTimeOutImgs); + } + +} + diff --git a/src/main/java/com/hitoli/fetchPic/PictInfo.java b/src/main/java/com/hitoli/fetchPic/PictInfo.java new file mode 100644 index 0000000..9353041 --- /dev/null +++ b/src/main/java/com/hitoli/fetchPic/PictInfo.java @@ -0,0 +1,47 @@ +package com.hitoli.fetchPic; + +public class PictInfo { + private String site; + private String url; + private String title; + private String htmlUrl; + + public PictInfo(String site, String title, String url, String htmlUrl) { + this.site = null == site ? "" : site; + this.title = null == title ? "" : title; + this.url = null == url ? "" : url; + this.htmlUrl = null == htmlUrl ? "" : htmlUrl; + } + + public String getSite() { + return site; + } + + public void setSite(String site) { + this.site = site; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getHtmlUrl() { + return htmlUrl; + } + + public void setHtmlUrl(String htmlUrl) { + this.htmlUrl = htmlUrl; + } +} \ No newline at end of file diff --git a/src/main/java/com/hitoli/fetchPic/Utils.java b/src/main/java/com/hitoli/fetchPic/Utils.java new file mode 100644 index 0000000..a75259f --- /dev/null +++ b/src/main/java/com/hitoli/fetchPic/Utils.java @@ -0,0 +1,622 @@ +package com.hitoli.fetchPic; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.logging.FileHandler; +import java.util.logging.Formatter; +import java.util.logging.Level; +import java.util.logging.LogRecord; +import java.util.logging.Logger; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +public class Utils { + + private static Logger loginfo = null; + private static Logger logerror = null; + + static class MyLogHander extends Formatter { + @Override + public String format(LogRecord record) { + return LocalDateTime.ofEpochSecond(record.getMillis()/1000, 0, ZoneOffset.ofHours(8)) + + " " + record.getLevel() + " : " + record.getMessage()+"\n"; + } + } + + private static void createLogger() { + loginfo = Logger.getLogger("fetchPicLog-info"); + loginfo.setLevel(Level.ALL); + logerror = Logger.getLogger("fetchPicLog-error"); + logerror.setLevel(Level.WARNING); +// ConsoleHandler consoleHandler = new ConsoleHandler(); +// consoleHandler.setLevel(Level.ALL); +// loginfo.addHandler(consoleHandler); +// logerror.addHandler(consoleHandler); + FileHandler logInfoFileHandler = null; + FileHandler logErrorFileHandler = null; + try { + logInfoFileHandler = new FileHandler(Main.downLoadDir + File.separator + "fetchPicLog-info.log"); + logErrorFileHandler = new FileHandler(Main.downLoadDir + File.separator + "fetchPicLog-error.log"); + } catch (IOException e) { + e.printStackTrace(); + } + if (null != logInfoFileHandler) { + logInfoFileHandler.setLevel(Level.INFO); + logInfoFileHandler.setFormatter(new MyLogHander()); + loginfo.addHandler(logInfoFileHandler); + } else { + System.out.println("创建信息日志文件失败"); + } + if (null != logErrorFileHandler) { + logErrorFileHandler.setLevel(Level.WARNING); + logErrorFileHandler.setFormatter(new MyLogHander()); + logerror.addHandler(logErrorFileHandler); + } else { + System.out.println("创建错误日志文件失败"); + } + } + + public static void info(String msg) { + if (null == loginfo) { + createLogger(); + } + loginfo.info(msg); + } + + public static void error(String msg) { + if (null == logerror) { + createLogger(); + } + logerror.warning(msg); + } + + /** + * 创建目录 + * @param dir + */ + public static void createDir(String dir){ + + File file = new File(dir); + if(!file.exists()){ + file.mkdir(); + } + + } + + /** + * 创建新的图片目录 + * @param dir + */ + public static void createNewImgDir(String dir){ + + File file = new File(dir); + if(!file.exists()){ + file.mkdir(); + Main.imgDownloadedDirName.add(file.getName() + "#####" + file.getPath()); + } + + } + + /** + * 读取目录下的目录 + * @param dir + * @return + */ + public static List getDirs(String dir) { + File file = new File(dir); + if(file.exists()){ + return Arrays.asList(file.listFiles()).stream().filter(f -> !f.isDirectory()).collect(Collectors.toList()); + } + return Collections.emptyList(); + } + + /** + * 删除目录 + * @param dir + */ + public static void delDir(String dir){ + + File file = new File(dir); + if(file.exists() && file.isDirectory()){ + for (File f : file.listFiles()) { + delFile(f.getPath()); + } + file.delete(); + } + + } + + /** + * 创建文件 + * @param fielPath + */ + public static void createFile(String fielPath){ + + File file = new File(fielPath); + if(!file.exists()){ + try { + file.createNewFile(); + } catch (IOException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } + } + + } + + /** + * 删除文件 + * @param filePath + */ + public static void delFile(String filePath){ + + File file = new File(filePath); + if(file.exists() && file.isFile()){ + file.delete(); + } + + } + + /** + * 检查指定路径是否存在 + * @param filePath + * @return + */ + public static boolean checkFilePathExists(String filePath) { + return new File(filePath).exists(); + } + + /** + * 读取文件信息到集合中(一行一条数据) + * @param filePath + * @param collection + */ + public static void readFileDataToCollection(String filePath, Collection collection) { + + InputStreamReader in = null; + BufferedReader br = null; + try { + in = new InputStreamReader(new FileInputStream(new File(filePath))); + br = new BufferedReader(in); + String line; + while ((line = br.readLine()) != null) { + if (StringUtils.isNotEmpty(line)) { + collection.add(line); + } + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } catch (IOException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } finally { + closeIO(in, br, null, null); + } + + } + + /** + * 读取文件图片信息对象到集合中(一行一条数据) + * @param filePath + * @param collection + */ + public static void readFilePictInfoDataToCollection(String filePath, Collection collection) { + + InputStreamReader in = null; + BufferedReader br = null; + try { + in = new InputStreamReader(new FileInputStream(new File(filePath))); + br = new BufferedReader(in); + String line; + while ((line = br.readLine()) != null) { + if (StringUtils.isNotEmpty(line)) { + String[] datas = line.split("#####"); + if (datas.length >= 3) { + PictInfo pictInfo = new PictInfo(datas[0], datas[1], datas[2], (datas.length == 3 ? null : datas[3])); + collection.add(pictInfo); + } + } + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } catch (IOException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } finally { + closeIO(in, br, null, null); + } + + } + + /** + * 把字符串写入文件(一行一条),默认追加写入 + * @param filePath + * @param str + */ + public static void writeStringToFile(String filePath, String str) { + writeStringToFile(filePath, str, true); + } + + /** + * 把集合中的数据写到文件中(一行一条) + * @param filePath + * @param collection + */ + public static void writeCollectionToFile(String filePath, Collection collection) { + + if (null != collection && !collection.isEmpty()) { + FileWriter fw = null; + BufferedWriter out = null; + try { + fw = new FileWriter(new File(filePath), true); + out = new BufferedWriter(fw); + for (String str : collection) { + out.write(str += "\r\n"); + } + out.flush(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } catch (IOException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } finally { + closeIO(null, null, fw, out); + } + } + + } + + /** + * 把集合中的图片对象数据写到文件中(一行一条) + * @param filePath + * @param collection + */ + public static void writePictInfoCollectionToFile(String filePath, Collection collection) { + + if (null != collection && !collection.isEmpty()) { + FileWriter fw = null; + BufferedWriter out = null; + try { + fw = new FileWriter(new File(filePath), true); + out = new BufferedWriter(fw); + for (PictInfo pictInfo : collection) { + String str = pictInfo.getSite() + "#####" + pictInfo.getTitle() + "#####" + pictInfo.getUrl(); + out.write(str += "\r\n"); + } + out.flush(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } catch (IOException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } finally { + closeIO(null, null, fw, out); + } + } + + } + + /** + * 把字符串写入文件(一行一条) + * @param filePath + * @param str + * @param append 是否追加 + */ + public static void writeStringToFile(String filePath, String str, boolean append) { + + FileWriter fw = null; + BufferedWriter out = null; + try { + fw = new FileWriter(new File(filePath), append); + out = new BufferedWriter(fw); + if (!(StringUtils.isEmpty(str) && !append)) { //如果字符串为空并且不追加,则只写入空的字符串(不带换行符) + str += "\r\n"; + } + out.write(str); + out.flush(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } catch (IOException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } finally { + closeIO(null, null, fw, out); + } + + } + + private static void closeIO(InputStreamReader in, BufferedReader br, FileWriter fw, BufferedWriter out) { + try { + if (br != null) { + br.close(); + } + if (in != null) { + in.close(); + } + if (out != null) { + out.close(); + } + if (fw != null) { + fw.close(); + } + } catch (IOException e) { + e.printStackTrace(); + Utils.error(e.getMessage()); + } + } + + /** + * 获取域名(无多余的后缀) + * @param context + * @return + */ + public static String getSite(String context) { + + String site = ""; + if (context.startsWith("http")) { + int start = 7; + String _start = "http://"; + if (context.startsWith("https")) { + start = 8; + _start = "https://"; + } + site = context.substring(start); + if (site.indexOf("/") != -1) { + site = _start + site.substring(0, site.indexOf("/")); + } else { + site = _start + site; + } + } else { + if (context.indexOf("/") != -1) { + site = context.substring(0, context.indexOf("/")); + } else { + site = context; + } + } + + return site; + + } + + /** + * 获取完整地址 + * @param site 无多余后缀的网站地址 + * @param path 有后缀的地址 + * @param url 抓取的地址 + * @return 如果抓取的地址开头是/,说明是从根目录开始。如果开头为..则表示从有后缀的地址往前回退 + */ + public static String getFullPath(String site, String path, String url) { + + if (url.startsWith("//")) { + if (site.startsWith("https")) { + url = "https:" + url; + } else { + url = "http:" +url; + } + } else if (url.startsWith("/")) { + url = site + url; + } else if (url.startsWith("../")) { + url = path + url; + } else if (url.startsWith("./") || (url.indexOf("/") == -1)) { + url = path.substring(0, (path.lastIndexOf("/") + 1)) + url; + } + + return url; + + } + + /** + * 检查url是否在网站域名内,防止超出边界 + * @param url + * @return + */ + public static Boolean checkSite(String url) { + Boolean result = false; + for (String s : Main.transboundarySites) {//先检查是否在可越界域名内 + s = getShortSite(s); + if (url.indexOf(s) != -1) { + result = true; + break; + } + } + if (!result) { + for (String s : Main.sites) { + s = getShortSite(s); + if (url.indexOf(s) != -1) { + result = true; + break; + } + } + } + if (!result) { + info("扫描到的url=" + url + "越界"); + } + return result; + } + + /** + * 获取网站短域名(不包含www及http前缀) + * @param site + * @return + */ + public static String getShortSite(String site) { + if (site.startsWith("http")) { + int start = 7; + if (site.startsWith("https")) { + start = 8; + } + site = site.substring(start); + } + return site.substring(site.indexOf(".")+1); + } + + /** + * 检查url是否是域名地址 + * @param url + * @return + */ + public static boolean isSite(String url) { + boolean result = false; + url = removeHttpOrHttps(url); + if (url.substring(url.length() - 1, url.length()).equals("/")) { + url = url.substring(0, url.length() - 1); + } + for (String site : Main.sites) { + if (url.equalsIgnoreCase(removeHttpOrHttps(site))) { + result = true; + break; + } + } + return result; + } + + /** + * 去除http://或https:// + * @param url + * @return + */ + public static String removeHttpOrHttps(String url) { + if (url.startsWith("http")) { + int start = 7; + if (url.startsWith("https")) { + start = 8; + } + url = url.substring(start); + } + return url; + } + + /** + * 获取可做为文件名的网站短域名(不包含www及http前缀,点用下划线替换) + * @param site + * @return + */ + public static String getUseFileNameShortSite(String site) { + return getShortSite(site).replaceAll("\\.", "_"); + } + + /** + * 特殊符号去除 + * @param str + * @return + */ + public static String specialSymbolRemoval(String str) { + return str.replaceAll("[\\\\/:\\\\*\\\\?\\\\\"<>\\\\|]", ""); + } + + /** + * 检查text是否"下一页"按钮中的文字 + * @param text + * @return + */ + public static boolean isNextPageButton(String text) { + boolean result = false; + for (String nextPageName : Main.nextPageNames) { + if (text.equals(nextPageName)) { + result = true; + break; + } + } + return result; + } + + /** + * 创建图片文件 + * @param title + * @return + */ + public static File createImgFile(String site, String title, String name, String suffix) throws IOException { + File file = null; + if (null == Main.imgDownloadedDirName || Main.imgDownloadedDirName.isEmpty()) { + file = _createImgFile(site, title, name, suffix); + } else { + Double allowDifferentCharacters = Double.parseDouble(Main.allowDifferentCharacters); + String oldPath = null; + String newTitle = title.replaceAll("(\\[+)|(\\]+)|(\\s)|(\\.+)|(\\_+)|(\\-+)|(\\——+)|(\\-+)|(\\第\\d+\\页)|(\\第)|(\\页)", ""); + char[] newChar = newTitle.toCharArray(); + for (String s : Main.imgDownloadedDirName) { + String[] _s = s.split("#####"); + if (_s.length == 2) { + String oldTitle = _s[0].replaceAll("(\\[+)|(\\]+)|(\\s)|(\\.+)|(\\_+)|(\\-+)|(\\——+)|(\\-+)|(\\第\\d+\\页)|(\\第)|(\\页)", ""); + if (oldTitle.equalsIgnoreCase(newTitle)) {//名称一样 + oldPath = _s[1]; + break; + } + char[] oldChar = oldTitle.toCharArray(); + int difference = 0; + char[] maxLengthChar = null; + char[] minLengthChar = null; + if (oldChar.length >= newChar.length) { + maxLengthChar = oldChar; + minLengthChar = newChar; + } else { + maxLengthChar = newChar; + minLengthChar = oldChar; + } + if (allowDifferentCharacters < 1 && allowDifferentCharacters > 0) {//取相差百分比 + allowDifferentCharacters = maxLengthChar.length - (maxLengthChar.length * allowDifferentCharacters); + } + if (maxLengthChar.length != minLengthChar.length && + ((maxLengthChar.length - minLengthChar.length) > allowDifferentCharacters)) {//长度已经超过相差值 + continue; + } + for (int j=0; j