init project
This commit is contained in:
88
pom.xml
Normal file
88
pom.xml
Normal file
@ -0,0 +1,88 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>com.hitoli.fetchPic</groupId>
|
||||
<artifactId>fetchPic</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||
<java.version>1.8</java.version>
|
||||
|
||||
<maven-compiler.version>3.7.0</maven-compiler.version>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpmime</artifactId>
|
||||
<version>4.5.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-codec</groupId>
|
||||
<artifactId>commons-codec</artifactId>
|
||||
<version>1.9</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-logging</groupId>
|
||||
<artifactId>commons-logging</artifactId>
|
||||
<version>1.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>fluent-hc</artifactId>
|
||||
<version>4.5.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient-cache</artifactId>
|
||||
<version>4.5.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient-win</artifactId>
|
||||
<version>4.5.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpcore</artifactId>
|
||||
<version>4.4.6</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
<version>1.11.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sourceforge.htmlunit</groupId>
|
||||
<artifactId>htmlunit</artifactId>
|
||||
<version>2.32</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
<version>3.7</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>${maven-compiler.version}</version>
|
||||
<configuration>
|
||||
<source>${java.version}</source>
|
||||
<target>${java.version}</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
</build>
|
||||
|
||||
</project>
|
3
src/main/java/META-INF/MANIFEST.MF
Normal file
3
src/main/java/META-INF/MANIFEST.MF
Normal file
@ -0,0 +1,3 @@
|
||||
Manifest-Version: 1.0
|
||||
Main-Class: com.hitoli.fetchPic.Main
|
||||
|
166
src/main/java/com/hitoli/fetchPic/DownLoad.java
Normal file
166
src/main/java/com/hitoli/fetchPic/DownLoad.java
Normal file
@ -0,0 +1,166 @@
|
||||
package com.hitoli.fetchPic;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.HttpEntity;
|
||||
import org.apache.http.client.config.RequestConfig;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
|
||||
public class DownLoad {
|
||||
|
||||
public static RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setSocketTimeout(5000)
|
||||
.setConnectTimeout(5000)
|
||||
.setConnectionRequestTimeout(5000)
|
||||
.setStaleConnectionCheckEnabled(true)
|
||||
.setRedirectsEnabled(true)
|
||||
.setMaxRedirects(3)
|
||||
.build();
|
||||
|
||||
public static CloseableHttpClient httpClient = HttpClients.custom().
|
||||
setDefaultRequestConfig(defaultRequestConfig).build();
|
||||
|
||||
public static void downloadPict(PictInfo pictInfo) {
|
||||
|
||||
String url = pictInfo.getUrl();
|
||||
CloseableHttpResponse response = null;
|
||||
OutputStream out = null;
|
||||
InputStream in=null;
|
||||
BufferedReader br=null;
|
||||
byte buffer[] = new byte[1024];
|
||||
if(StringUtils.isNotEmpty(url)){
|
||||
try {
|
||||
String suffix = url.substring(url.lastIndexOf("."));
|
||||
String temp = suffix.substring(1, suffix.length()).toUpperCase();
|
||||
|
||||
if (!(temp.equals("BMP") || temp.equals("JPG") || temp.equals("JPEG") || temp.equals("GIF") ||
|
||||
temp.equals("PNG") || temp.equals("WEBP"))) { //非图片的丢弃
|
||||
return;
|
||||
}
|
||||
String name = url.substring(url.lastIndexOf("/")+1, url.lastIndexOf("."));
|
||||
if (StringUtils.isEmpty(name)) {
|
||||
name = String.valueOf(System.currentTimeMillis());
|
||||
} else {
|
||||
name = Utils.specialSymbolRemoval(name + "_" + System.currentTimeMillis());
|
||||
}
|
||||
HttpGet httpGet = new HttpGet(url);
|
||||
httpGet.setConfig(defaultRequestConfig);
|
||||
if (StringUtils.isNotEmpty(Main.refererUrl)) {
|
||||
if (Main.refererUrl.equalsIgnoreCase("self")) {
|
||||
httpGet.setHeader("referer", pictInfo.getHtmlUrl());
|
||||
} else {
|
||||
httpGet.setHeader("referer", Main.refererUrl);
|
||||
}
|
||||
}
|
||||
response = httpClient.execute(httpGet);
|
||||
HttpEntity entity = response.getEntity();
|
||||
long imgSize = entity.getContentLength();
|
||||
if (imgSize < Main.imgMinSize*1024) { //默认图片小于1KB的丢弃
|
||||
throw new Exception("图片只有" + (imgSize/1024) + "KB,小于" + Main.imgMinSize + "KB");
|
||||
}
|
||||
in = entity.getContent();
|
||||
String title = pictInfo.getTitle();
|
||||
if (StringUtils.isEmpty(title)) {
|
||||
title = "other";
|
||||
} else {
|
||||
title = Utils.specialSymbolRemoval(title);
|
||||
}
|
||||
File file = Utils.createImgFile(Utils.getUseFileNameShortSite(pictInfo.getSite()), title, name, suffix);
|
||||
Utils.info("正在下载:" + url);
|
||||
out = new FileOutputStream(file);
|
||||
int index = 0;
|
||||
while((index = in.read(buffer)) != -1){
|
||||
out.write(buffer,0,index);
|
||||
}
|
||||
out.flush();
|
||||
|
||||
Main.imgDownloaded.add(url);
|
||||
} catch (Exception e) {
|
||||
Utils.error("下载失败:" + url + " [" + e.getMessage() + "]");
|
||||
if (e instanceof SocketTimeoutException) {
|
||||
Main.readTimeOutImgs.add(pictInfo);
|
||||
}
|
||||
} finally {
|
||||
try {
|
||||
if (br != null){
|
||||
br.close();
|
||||
}
|
||||
if (out != null){
|
||||
out.close();
|
||||
}
|
||||
if (in != null){
|
||||
in.close();
|
||||
}
|
||||
if (response != null) {
|
||||
response.close();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static void downloadPict(List<PictInfo> pictLinks){
|
||||
|
||||
if (null != pictLinks && !pictLinks.isEmpty()) {
|
||||
if (Main.thread) {
|
||||
for (int i=0; i<pictLinks.size(); i++) {
|
||||
PictInfo pictInfo = pictLinks.get(i);
|
||||
if (pictInfo != null && !Main.imgDownloaded.contains(pictInfo.getUrl())) {
|
||||
new Thread(new DownloadPictRunnable(pictInfo)).start();
|
||||
}
|
||||
if ((i+1)%Main.threadSize == 0) {
|
||||
try {
|
||||
Thread.sleep(Main.threadSleep*1000);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (PictInfo pictInfo : pictLinks) {
|
||||
if (pictInfo != null && !Main.imgDownloaded.contains(pictInfo.getUrl())) {
|
||||
downloadPict(pictInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pictLinks.clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class DownloadPictRunnable implements Runnable {
|
||||
|
||||
private PictInfo pictInfo;
|
||||
|
||||
public DownloadPictRunnable(PictInfo pictInfo) {
|
||||
this.pictInfo = pictInfo;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
DownLoad.downloadPict(pictInfo);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
255
src/main/java/com/hitoli/fetchPic/FindLink.java
Normal file
255
src/main/java/com/hitoli/fetchPic/FindLink.java
Normal file
@ -0,0 +1,255 @@
|
||||
package com.hitoli.fetchPic;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
|
||||
public class FindLink {
|
||||
|
||||
/**
|
||||
* 找出url下的所有a标签连接
|
||||
* @param url
|
||||
* @param pageLinks
|
||||
* @param lazyPageLinks 非图片链接延迟扫描
|
||||
* @param connectExceptionRetryCount 重试次数
|
||||
*/
|
||||
public static boolean addPageLink(String url, List<String> pageLinks, List<String> lazyPageLinks, int connectExceptionRetryCount) {
|
||||
|
||||
boolean result = true;
|
||||
if (!Main.pageSet.contains(url)) { //已分析过的连接不再分析
|
||||
Utils.info("开始分析url[" + url + "]中的可用连接");
|
||||
try {
|
||||
Document document = HttpUtils.getInstance().getHtmlPageResponseAsDocument(url);
|
||||
String site = Utils.getSite(url);
|
||||
Elements elements = document.select("a");
|
||||
boolean findNext = false;//是否找到下一页
|
||||
Set<String> allUrls = new HashSet<String>();//当前访问url下的所有链接地址
|
||||
for (Element element : elements) {
|
||||
String href = element.attr("href");
|
||||
if (!"".equals(href) && !"#".equals(href) && href.indexOf(".js") == -1 && href.indexOf(".css") == -1
|
||||
&& href.indexOf("javascript") == -1) {
|
||||
href = Utils.getFullPath(site, url, href);
|
||||
if (StringUtils.isNotEmpty(Main.fixedUrlPrefix)) {
|
||||
if (!href.startsWith(Main.fixedUrlPrefix)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (href.equals(url) || Main.pageSet.contains(href)) {
|
||||
continue;
|
||||
}
|
||||
if (!Utils.checkSite(href)) {
|
||||
continue;
|
||||
}
|
||||
Elements imgs = element.select("img");
|
||||
boolean discard = false;
|
||||
for (Element img : imgs) {
|
||||
//A标签的href地址和图片地址一致,说明是图片地址,不用再对该地址进行分析
|
||||
if (Utils.getFullPath(site, url, img.attr("src")).equals(href)) {
|
||||
discard = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (discard) {
|
||||
continue;
|
||||
}
|
||||
allUrls.add(href);
|
||||
String text = element.text().trim();
|
||||
String title = element.attr("title");
|
||||
if (Utils.isNextPageButton(text) || Utils.isNextPageButton(title)) { //始终优先处理下一页的内容
|
||||
pageLinks.add(1, href);
|
||||
findNext = true;
|
||||
} else {
|
||||
//通过父元素的class判断url是否是头布局中的链接(头布局中的链接延迟扫描)
|
||||
boolean head = false;
|
||||
Elements parents = element.parents();
|
||||
for (Element parent : parents) {
|
||||
for (String className : parent.classNames()) {
|
||||
if (className.equalsIgnoreCase("head") ||
|
||||
className.equalsIgnoreCase("header") ||
|
||||
className.equalsIgnoreCase("logo")) {
|
||||
head = true;
|
||||
break;
|
||||
}
|
||||
if (head) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (head || imgs.isEmpty() || Utils.isSite(href)) { //非图片链接延迟扫描
|
||||
if (!lazyPageLinks.contains(href)) {
|
||||
lazyPageLinks.add(href);
|
||||
}
|
||||
} else {
|
||||
if (!pageLinks.contains(href)) {
|
||||
pageLinks.add(href);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* 通过比对url/到点之间的数字,大于当前页数字,并且小于所有获取的数字则是下一页
|
||||
* (
|
||||
* 如xxx/xxx_0.html,xxx/xxx_1.html,xxx/xxx_2.html,
|
||||
* 当前页是xxx/xxx_0.html
|
||||
* 获取xxx_0,xxx_1,xxx_2
|
||||
* 获取数字0,1,2
|
||||
* 比0大的数中最小的为下一页
|
||||
* )
|
||||
*/
|
||||
if (!findNext && !allUrls.isEmpty()) {
|
||||
String _urlStart = url.substring(0, url.lastIndexOf("/")+1);
|
||||
String _urlEnd = url.substring(url.lastIndexOf("/")+1, url.lastIndexOf("."));
|
||||
String urlNumberStr = "";
|
||||
for(int i=0; i<_urlEnd.length(); i++){
|
||||
if(_urlEnd.charAt(i) >= 48 && _urlEnd.charAt(i) <= 57) {
|
||||
urlNumberStr += _urlEnd.charAt(i);
|
||||
}
|
||||
}
|
||||
Long urlNumber = Long.valueOf(urlNumberStr);
|
||||
Long nextPageNumber = 0l;
|
||||
String nextPageUrl = "";
|
||||
for (String s : allUrls) {
|
||||
if (!url.equalsIgnoreCase(s) && s.indexOf(_urlStart) != -1) {
|
||||
String _sEnd = s.substring(s.lastIndexOf("/")+1, s.lastIndexOf("."));
|
||||
String _sNumberStr = "";
|
||||
for(int i=0; i<_sEnd.length(); i++) {
|
||||
if (_sEnd.charAt(i) >= 48 && _sEnd.charAt(i) <= 57) {
|
||||
_sNumberStr += _sEnd.charAt(i);
|
||||
}
|
||||
}
|
||||
Long _sNumber = Long.valueOf(_sNumberStr);
|
||||
if (nextPageUrl == "") {
|
||||
if (_sNumber.intValue() > urlNumber) {
|
||||
nextPageNumber = _sNumber;
|
||||
nextPageUrl = s;
|
||||
}
|
||||
} else {
|
||||
if (_sNumber < nextPageNumber) {
|
||||
nextPageNumber = _sNumber;
|
||||
nextPageUrl = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nextPageUrl != "") {
|
||||
pageLinks.add(1, nextPageUrl);
|
||||
findNext = true;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (connectExceptionRetryCount <= 0) {
|
||||
Utils.error("无效地址:" + url);
|
||||
result = false;
|
||||
} else {
|
||||
Utils.error("重试访问地址:" + url + "第" + (5 - connectExceptionRetryCount + 1) + "次");
|
||||
connectExceptionRetryCount--;
|
||||
return addPageLink(url, pageLinks, lazyPageLinks, connectExceptionRetryCount);
|
||||
}
|
||||
}
|
||||
Utils.info("分析url[" + url + "]中的可用连接结束");
|
||||
Main.pageSet.add(url);
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 找出url下的所有图片连接
|
||||
* @param url
|
||||
* @param pictInfos
|
||||
* @param nextPageUrl
|
||||
* @param connectExceptionRetryCount 重试次数
|
||||
*/
|
||||
public static boolean addPictLink(String url, List<PictInfo> pictInfos, String nextPageUrl, int connectExceptionRetryCount) {
|
||||
|
||||
boolean result = true;
|
||||
if (null == nextPageUrl) {
|
||||
nextPageUrl = "";
|
||||
}
|
||||
Utils.info("开始分析url[" + url + "]中的可用图片连接");
|
||||
try {
|
||||
Document document = HttpUtils.getInstance().getHtmlPageResponseAsDocument(url);
|
||||
String site = Utils.getSite(url);
|
||||
Elements elements = document.select("img");
|
||||
Element head = document.head();
|
||||
Elements titles = head.getElementsByTag("title");
|
||||
String title = "";
|
||||
if (null != titles && !titles.isEmpty()) {
|
||||
title = titles.get(0).text();
|
||||
if (StringUtils.isNotEmpty(title)) {
|
||||
if (null != Main.pageTitlefilters && !Main.pageTitlefilters.isEmpty()) {
|
||||
for (String filter : Main.pageTitlefilters) {
|
||||
title = title.replaceAll(filter, "");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (Element element : elements) {
|
||||
String imgAttrName = "src";
|
||||
if (null != Main.imgSrcRepletTags && !Main.imgSrcRepletTags.isEmpty()) {
|
||||
for (String tag : Main.imgSrcRepletTags) {
|
||||
if (element.hasAttr(tag)) {
|
||||
imgAttrName = tag;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
String src = element.attr(imgAttrName);
|
||||
if (StringUtils.isNotEmpty(src) && src.toUpperCase().indexOf("JAVASCRIPT") == -1) {
|
||||
if (!Main.imgNamefilters.isEmpty()) { //检查是否存在与要丢弃的图片名称中
|
||||
boolean discard = false;
|
||||
try {
|
||||
String temp = src.substring(src.lastIndexOf("/") + 1,
|
||||
src.lastIndexOf(".")).toUpperCase();
|
||||
for (String imgNamefilter : Main.imgNamefilters) {
|
||||
if (temp.equals(imgNamefilter)) {
|
||||
discard = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
if (discard) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
src = Utils.getFullPath(site, url, src);
|
||||
Element parent = element.parent();
|
||||
if (parent.tagName().toUpperCase().equals("A")) {
|
||||
//一般A标签下的img都是预览图片,但有些网站的a标签下的图片和a标签href地址一致,不是预览图片
|
||||
//有的图片a标签的href是下一页地址
|
||||
String parentUrl = parent.attr("href");
|
||||
String href = Utils.getFullPath(site, url, parentUrl);
|
||||
if (StringUtils.isNotEmpty(parentUrl) && !href.equals(src) && !nextPageUrl.equals(href)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (!Main.imgDownloaded.contains(src)) {
|
||||
pictInfos.add(new PictInfo(site, title, src, url));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (connectExceptionRetryCount <= 0) {
|
||||
Utils.error("无效地址:" + url);
|
||||
result = false;
|
||||
} else {
|
||||
Utils.error("重试访问地址:" + url + "第" + (5 - connectExceptionRetryCount + 1) + "次");
|
||||
connectExceptionRetryCount--;
|
||||
addPictLink(url, pictInfos, nextPageUrl, connectExceptionRetryCount);
|
||||
}
|
||||
}
|
||||
Utils.info("分析url[" + url + "]中的可用图片连接结束");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
115
src/main/java/com/hitoli/fetchPic/HttpUtils.java
Normal file
115
src/main/java/com/hitoli/fetchPic/HttpUtils.java
Normal file
@ -0,0 +1,115 @@
|
||||
package com.hitoli.fetchPic;
|
||||
|
||||
import com.gargoylesoftware.htmlunit.BrowserVersion;
|
||||
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
|
||||
import com.gargoylesoftware.htmlunit.WebClient;
|
||||
import com.gargoylesoftware.htmlunit.html.HtmlPage;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
/**
|
||||
* <pre>
|
||||
* Http工具,包含:
|
||||
* 高级http工具(使用net.sourceforge.htmlunit获取完整的html页面,即完成后台js代码的运行)
|
||||
* </pre>
|
||||
*/
|
||||
public class HttpUtils {
|
||||
|
||||
private WebClient webClient;
|
||||
|
||||
/**
|
||||
* 等待异步JS执行时间
|
||||
*/
|
||||
private int waitForBackgroundJavaScript;
|
||||
|
||||
private static HttpUtils httpUtils;
|
||||
|
||||
private HttpUtils() {
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取实例
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public static HttpUtils getInstance() {
|
||||
if (httpUtils == null) {
|
||||
httpUtils = new HttpUtils();
|
||||
}
|
||||
return httpUtils;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param browserTimeout 浏览器请求超时时间
|
||||
* @param jsTimeout js请求超时时间
|
||||
* @param jsEnabled 是否启用js
|
||||
* @param waitForBackgroundJavaScript 等待异步JS执行时间
|
||||
*/
|
||||
public void initWebClient(int browserTimeout, int jsTimeout, boolean jsEnabled, int waitForBackgroundJavaScript) {
|
||||
this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
|
||||
|
||||
webClient = new WebClient(BrowserVersion.CHROME);
|
||||
|
||||
webClient.getOptions().setThrowExceptionOnScriptError(false);//当JS执行出错的时候是否抛出异常
|
||||
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);//当HTTP的状态非200时是否抛出异常
|
||||
webClient.getOptions().setActiveXNative(false);
|
||||
webClient.getOptions().setCssEnabled(false);//是否启用CSS
|
||||
|
||||
if (jsEnabled) {
|
||||
webClient.getOptions().setJavaScriptEnabled(true); //很重要,启用JS
|
||||
webClient.setAjaxController(new NicelyResynchronizingAjaxController());//很重要,设置支持AJAX
|
||||
} else {
|
||||
webClient.getOptions().setJavaScriptEnabled(false);
|
||||
}
|
||||
|
||||
webClient.getOptions().setTimeout(browserTimeout);//设置“浏览器”的请求超时时间
|
||||
webClient.setJavaScriptTimeout(jsTimeout);//设置JS执行的超时时间
|
||||
}
|
||||
|
||||
/**
|
||||
* 将网页返回为解析后的文档格式
|
||||
*
|
||||
* @param html
|
||||
* @return
|
||||
* @throws Exception
|
||||
*/
|
||||
public static Document parseHtmlToDoc(String html) throws Exception {
|
||||
return removeHtmlSpace(html);
|
||||
}
|
||||
|
||||
private static Document removeHtmlSpace(String str) {
|
||||
Document doc = Jsoup.parse(str);
|
||||
String result = doc.html().replace(" ", "");
|
||||
return Jsoup.parse(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取页面文档字串(等待异步JS执行)
|
||||
*
|
||||
* @param url 页面URL
|
||||
* @return
|
||||
* @throws Exception
|
||||
*/
|
||||
public String getHtmlPageResponse(String url) throws Exception {
|
||||
HtmlPage page;
|
||||
try {
|
||||
page = webClient.getPage(url);
|
||||
} catch (Exception e) {
|
||||
throw e;
|
||||
}
|
||||
webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);//该方法阻塞线程
|
||||
return page.asXml();
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取页面文档Document对象(等待异步JS执行)
|
||||
*
|
||||
* @param url 页面URL
|
||||
* @return
|
||||
* @throws Exception
|
||||
*/
|
||||
public Document getHtmlPageResponseAsDocument(String url) throws Exception {
|
||||
return parseHtmlToDoc(getHtmlPageResponse(url));
|
||||
}
|
||||
}
|
417
src/main/java/com/hitoli/fetchPic/Main.java
Normal file
417
src/main/java/com/hitoli/fetchPic/Main.java
Normal file
@ -0,0 +1,417 @@
|
||||
package com.hitoli.fetchPic;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
|
||||
public class Main {
|
||||
|
||||
public static Set<String> sites = new HashSet<String>(); //所有网站根(防止访问地址越界)
|
||||
public static Set<String> transboundarySites = new HashSet<String>(); //可越界的网站根
|
||||
public static Set<String> pageSet = new HashSet<String>(); //所有已爬过的地址
|
||||
public static Set<PictInfo> readTimeOutImgs = new HashSet<PictInfo>(); //所有读取超时的图片
|
||||
public static Set<String> imgDownloaded = new HashSet<String>(); //已经下载的图片地址
|
||||
public static Set<String> imgDownloadedDirName = new HashSet<String>();//已经下载的图片目录名称
|
||||
public static String home = null;
|
||||
public static String downLoadDir = null; //下载目录
|
||||
public static String siteData = null; //网站可以扫描的域名地址记录文件(防止越界)
|
||||
public static String downLoadedImgData = null; //已经下载的图片地址记录文件
|
||||
public static String downloadedImgDirNameData = null; //已经下载的图片目录名称记录文件
|
||||
public static String allowDifferentCharacters = "0";//比对下载图片的title和已经下载的图片目录名称记录允许相差的字符数(用于检查图片存放路径的目录是否已经存在)
|
||||
public static String readTimeOutImgData = null; //所有读取超时的图片地址记录文件
|
||||
public static String allUrlData = null; //所有已爬过的地址记录文件
|
||||
public static String currentUrlData = null; //当前正在处理的地址记录文件
|
||||
public static String lazyUrlData = null; //延迟处理的地址记录文件
|
||||
public static String commandFile = null;//命令文件(每次循环pageLinks检查一次)
|
||||
public static long imgMinSize = 1; //下载图片最小单位(KB)
|
||||
public static List<String> nextPageNames = Arrays.stream(
|
||||
new String[]{"下一页", "下一篇", "下一章", "后", "NEXT", ">", ">>", ">>>"}
|
||||
).collect(Collectors.toList()); //下一页按钮中的text
|
||||
public static List<String> pageTitlefilters = Arrays.stream(
|
||||
new String[]{"(\\第\\d+\\页)"}
|
||||
).collect(Collectors.toList()); //title中需要过滤的关键字,过滤全部关键字
|
||||
public static List<String> imgNamefilters = Arrays.stream(
|
||||
new String[]{"LOGO", "FAVICON"}
|
||||
).collect(Collectors.toList()); //下载图片中需要丢弃的图片名称
|
||||
public static List<String> imgSrcRepletTags = Arrays.stream(
|
||||
new String[]{"original"}
|
||||
).collect(Collectors.toList()); //下载图片的链接非src标签,如果存在此list中的标签名,则获取对应标签的值为下载链接
|
||||
public static String refererUrl = "self"; //下载图片时需要模拟的来源url(如有的情况下),默认为self(发现图片的html地址)
|
||||
public static String fixedUrlPrefix = ""; //只扫描固定前缀的url
|
||||
public static int browserTimeout = 5000;//请求超时时间,默认5秒
|
||||
public static int jsTimeout = 1000;//请求超时时间,默认1秒
|
||||
public static int waitForBackgroundJavaScript = 1;//等待异步JS执行时间,默认1秒
|
||||
public static boolean jsEnabled = true;//是否启用js
|
||||
public static boolean thread = false; //是否多线程下载
|
||||
public static int threadSize = 5; //一次开启的线程数
|
||||
public static int threadSleep = 1; //开满线程数后等待多少秒
|
||||
public static boolean stop = false; //是否退出
|
||||
public static boolean autoSaveMemoryData = true; //自动保存内存数据到文件
|
||||
public static int autoSaveMemoryDataInterval = 3; //保存间隔(分钟)
|
||||
public static Long lastAutoSaveMemoryDataTime = null; //最后一次保存时间
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
HttpUtils.getInstance().initWebClient(browserTimeout, jsTimeout, jsEnabled, waitForBackgroundJavaScript);
|
||||
List<String> pageLinks = new ArrayList<String>();
|
||||
List<String> lazyPageLinks = new ArrayList<>();
|
||||
List<PictInfo> pictInfos = new ArrayList<PictInfo>();
|
||||
init(args, pageLinks, lazyPageLinks, pictInfos);
|
||||
|
||||
int networkExceptionRetryCount = 5;//连续访问url失败次数
|
||||
Set<String> failUrls = new HashSet<String>();
|
||||
while(pageLinks.size() > 0){
|
||||
int connectExceptionRetryCount = 5;//重复访问同一url失败次数
|
||||
|
||||
String url = pageLinks.get(0);
|
||||
if (StringUtils.isEmpty(url)) {
|
||||
pageLinks.remove(0);
|
||||
continue;
|
||||
}
|
||||
if (StringUtils.isNotEmpty(fixedUrlPrefix)) {
|
||||
if (!url.startsWith(fixedUrlPrefix)) {
|
||||
pageLinks.remove(0);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
//分析页面所有连接
|
||||
boolean result = FindLink.addPageLink(url, pageLinks, lazyPageLinks, connectExceptionRetryCount);
|
||||
//分析页面所有图片
|
||||
if (result) {
|
||||
networkExceptionRetryCount = 5;//访问成功,恢复次数
|
||||
String nextPageUrl = "";
|
||||
if (pageLinks.size() >= 2) {
|
||||
nextPageUrl = pageLinks.get(1);
|
||||
}
|
||||
result = FindLink.addPictLink(url, pictInfos, nextPageUrl, connectExceptionRetryCount);
|
||||
if (!result) {
|
||||
failUrls.add(url);
|
||||
networkExceptionRetryCount--;
|
||||
} else {
|
||||
networkExceptionRetryCount = 5;//访问成功,恢复次数
|
||||
}
|
||||
} else {
|
||||
failUrls.add(url);
|
||||
networkExceptionRetryCount--;
|
||||
}
|
||||
if (networkExceptionRetryCount <= 0) {
|
||||
Utils.error("网络可能出现问题,连续访问5次不同url失败");
|
||||
pageLinks.remove(0);
|
||||
pageLinks.addAll(0, failUrls);
|
||||
pageSet.removeAll(failUrls);
|
||||
writeMemoryDataToFile(pageLinks, lazyPageLinks);
|
||||
writeDefaultCommand();
|
||||
System.exit(0);
|
||||
}
|
||||
if (pictInfos.size() > 0) {
|
||||
Utils.info("url[" + url + "]中找到" + pictInfos.size() + "个可用图片连接");
|
||||
DownLoad.downloadPict(pictInfos);
|
||||
} else {
|
||||
Utils.info("url[" + url + "]中找到0可用图片连接");
|
||||
}
|
||||
pageLinks.remove(0);
|
||||
|
||||
//图片链接扫描完后取非图片链接继续扫描
|
||||
getLazyUrl(pageLinks, lazyPageLinks);
|
||||
|
||||
//检查命令文件,如果stop等于true或者pagelinks为空则停止程序
|
||||
checkAndExecutCommand(pageLinks, lazyPageLinks);
|
||||
|
||||
//检查是否自动保存内存数据
|
||||
checkAutoSaveMemoryData(pageLinks, lazyPageLinks);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void init(String[] prams, List<String> pageLinks, List<String> lazyPageLinks, List<PictInfo> pictInfos) {
|
||||
|
||||
if (thread) {
|
||||
sites = Collections.synchronizedSet(new HashSet<String>());
|
||||
pageSet = Collections.synchronizedSet(new HashSet<String>());
|
||||
imgDownloaded = Collections.synchronizedSet(new HashSet<String>());
|
||||
readTimeOutImgs = Collections.synchronizedSet(new HashSet<PictInfo>());
|
||||
pageLinks = Collections.synchronizedList(new ArrayList<String>());
|
||||
lazyPageLinks = Collections.synchronizedList(new ArrayList<String>());
|
||||
pictInfos = Collections.synchronizedList(new ArrayList<PictInfo>());
|
||||
}
|
||||
|
||||
//图片保存路径
|
||||
home = "";
|
||||
if (null != prams && prams.length > 0) {
|
||||
home = prams[0];
|
||||
}
|
||||
System.out.println("HOME路径为:" + home);
|
||||
if (!Utils.checkFilePathExists(home)) {
|
||||
System.out.println("HOME路径不存在,请手动创建");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
downLoadDir = home.endsWith(File.separator) ? (home + "pict") : (home + File.separator + "pict");
|
||||
//创建文件下载目录
|
||||
Utils.createDir(downLoadDir);
|
||||
siteData = downLoadDir + File.separator + "siteData";
|
||||
downLoadedImgData = downLoadDir + File.separator + "downLoadedImgData";
|
||||
downloadedImgDirNameData = downLoadDir + File.separator + "downloadedImgDirNameData";
|
||||
readTimeOutImgData = downLoadDir + File.separator + "readTimeOutImgData";
|
||||
allUrlData = downLoadDir + File.separator + "allUrlData";
|
||||
currentUrlData = downLoadDir + File.separator + "currentUrlData";
|
||||
lazyUrlData = downLoadDir + File.separator + "lazyUrlData";
|
||||
commandFile = downLoadDir + File.separator + "command";
|
||||
//创建网站域名地址记录文件(不存在才创建)
|
||||
Utils.createFile(siteData);
|
||||
//创建下载记录文件(不存在才创建)
|
||||
Utils.createFile(downLoadedImgData);
|
||||
//创建已经下载的图片目录名称文件(不存在才创建)
|
||||
Utils.createFile(downloadedImgDirNameData);
|
||||
//创建读取超时记录文件(不存在才创建)
|
||||
Utils.createFile(readTimeOutImgData);
|
||||
//创建所有已爬过的地址记录文件(不存在才创建)
|
||||
Utils.createFile(allUrlData);
|
||||
//创建当前需要处理的地址记录文件(不存在才创建)
|
||||
Utils.createFile(currentUrlData);
|
||||
//创建延迟处理的地址记录文件(不存在才创建)
|
||||
Utils.createFile(lazyUrlData);
|
||||
//创建命令文件(不存在才创建)
|
||||
Utils.createFile(commandFile);
|
||||
//写入初始命令
|
||||
writeDefaultCommand();
|
||||
//读取网站域名地址记录到内存中
|
||||
Utils.readFileDataToCollection(siteData, sites);
|
||||
//读取已经下载的文件记录到内存中
|
||||
Utils.readFileDataToCollection(downLoadedImgData, imgDownloaded);
|
||||
//读取已经下载的图片目录名称到内存中
|
||||
Utils.readFileDataToCollection(downloadedImgDirNameData, imgDownloadedDirName);
|
||||
//读取超时文件记录到内存中
|
||||
Utils.readFilePictInfoDataToCollection(readTimeOutImgData, readTimeOutImgs);
|
||||
//读取所有已爬过的地址记录到内存中
|
||||
Utils.readFileDataToCollection(allUrlData, pageSet);
|
||||
|
||||
//读取当前需要处理的连接地址记录到内存中
|
||||
Utils.readFileDataToCollection(currentUrlData, pageLinks);
|
||||
//读取延迟处理的连接地址记录到内存中
|
||||
Utils.readFileDataToCollection(lazyUrlData, lazyPageLinks);
|
||||
//图片链接扫描完后取非图片链接继续扫描
|
||||
getLazyUrl(pageLinks, lazyPageLinks);
|
||||
if (pageLinks.isEmpty()) {
|
||||
if (null == prams || prams.length < 2) {
|
||||
Utils.error("请输入抓取地址");
|
||||
System.exit(1);
|
||||
} else {
|
||||
pageLinks.add(prams[1]);
|
||||
sites.add(Utils.getSite(prams[1]));
|
||||
Utils.writeStringToFile(siteData, "", false);
|
||||
Utils.writeCollectionToFile(siteData, sites);
|
||||
}
|
||||
}
|
||||
|
||||
if (null != prams && prams.length >= 3) {
|
||||
Integer _imgMinSize = null;
|
||||
try {
|
||||
_imgMinSize = Integer.valueOf(prams[2]);
|
||||
} catch (Exception e) {
|
||||
Utils.error("抓取最小图片大小输入有误,必须大于等于0");
|
||||
}
|
||||
if (_imgMinSize == null || _imgMinSize < 0) {
|
||||
Utils.error("抓取最小图片大小输入有误,必须大于等于0");
|
||||
} else {
|
||||
imgMinSize = _imgMinSize;
|
||||
Utils.error("抓取最小图片大小为" + imgMinSize + "KB");
|
||||
}
|
||||
}
|
||||
|
||||
if (autoSaveMemoryData) { //首次启动把启动时间作为最后一次自动保存时间
|
||||
lastAutoSaveMemoryDataTime = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
if (!readTimeOutImgs.isEmpty()) {
|
||||
Utils.info("重新下载上次超时的图片");
|
||||
List<PictInfo> _pictInfos = readTimeOutImgs.stream().map(p -> {
|
||||
return new PictInfo(p.getSite(), p.getTitle(), p.getUrl(), p.getHtmlUrl());
|
||||
}).collect(Collectors.toList());
|
||||
readTimeOutImgs.clear();
|
||||
DownLoad.downloadPict(_pictInfos);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void getLazyUrl(List<String> pageLinks, List<String> lazyPageLinks) {
|
||||
//图片链接扫描完后取非图片链接继续扫描
|
||||
if (pageLinks.isEmpty() && !lazyPageLinks.isEmpty()) {
|
||||
int index = 0;
|
||||
String lazyUrl = lazyPageLinks.get(index);
|
||||
if (Utils.isSite(lazyUrl)) {
|
||||
if (lazyPageLinks.size() > 1) {
|
||||
index = 1;
|
||||
lazyUrl = lazyPageLinks.get(index);
|
||||
}
|
||||
}
|
||||
pageLinks.add(lazyUrl);
|
||||
lazyPageLinks.remove(index);
|
||||
}
|
||||
}
|
||||
|
||||
private static void checkAndExecutCommand(List<String> pageLinks, List<String> lazyPageLinks) {
|
||||
|
||||
readCommandToMemory("stop");
|
||||
|
||||
if (stop || pageLinks.isEmpty()) {
|
||||
if (pageLinks.isEmpty()) {
|
||||
Utils.info("无法找到新的url,抓取图片结束");
|
||||
}
|
||||
writeMemoryDataToFile(pageLinks, lazyPageLinks);
|
||||
writeDefaultCommand();
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void checkAutoSaveMemoryData(List<String> pageLinks, List<String> lazyPageLinks) {
|
||||
|
||||
if (autoSaveMemoryData) {
|
||||
if (null == lastAutoSaveMemoryDataTime) {
|
||||
writeMemoryDataToFile(pageLinks, lazyPageLinks);
|
||||
writeDefaultCommand();
|
||||
lastAutoSaveMemoryDataTime = System.currentTimeMillis();
|
||||
} else if (System.currentTimeMillis() > (lastAutoSaveMemoryDataTime + (autoSaveMemoryDataInterval*60*1000))) {
|
||||
writeMemoryDataToFile(pageLinks, lazyPageLinks);
|
||||
writeDefaultCommand();
|
||||
lastAutoSaveMemoryDataTime = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void writeDefaultCommand() {
|
||||
|
||||
readCommandToMemory();
|
||||
|
||||
Utils.writeStringToFile(commandFile, "", false);
|
||||
Utils.writeStringToFile(commandFile, "stop=false");
|
||||
Utils.writeStringToFile(commandFile, "imgMinSize=" + imgMinSize);
|
||||
Utils.writeStringToFile(commandFile, "nextPageNames=" + StringUtils.join(nextPageNames, ","));
|
||||
Utils.writeStringToFile(commandFile, "pageTitlefilters=" + StringUtils.join(pageTitlefilters, ","));
|
||||
Utils.writeStringToFile(commandFile, "imgNamefilters=" + StringUtils.join(imgNamefilters, ","));
|
||||
Utils.writeStringToFile(commandFile, "imgSrcRepletTags=" + StringUtils.join(imgSrcRepletTags, ","));
|
||||
Utils.writeStringToFile(commandFile, "allowDifferentCharacters=" + allowDifferentCharacters);
|
||||
Utils.writeStringToFile(commandFile, "browserTimeout=" + browserTimeout);
|
||||
Utils.writeStringToFile(commandFile, "jsTimeout=" + jsTimeout);
|
||||
Utils.writeStringToFile(commandFile, "jsEnabled=" + jsEnabled);
|
||||
Utils.writeStringToFile(commandFile, "waitForBackgroundJavaScript=" + waitForBackgroundJavaScript);
|
||||
Utils.writeStringToFile(commandFile, "refererUrl=" + refererUrl);
|
||||
Utils.writeStringToFile(commandFile, "transboundarySites=" + StringUtils.join(transboundarySites, ","));
|
||||
Utils.writeStringToFile(commandFile, "fixedUrlPrefix=" + fixedUrlPrefix);
|
||||
Utils.writeStringToFile(commandFile, "thread=" + thread);
|
||||
Utils.writeStringToFile(commandFile, "threadSize=" + threadSize);
|
||||
Utils.writeStringToFile(commandFile, "threadSleep=" + threadSleep);
|
||||
Utils.writeStringToFile(commandFile, "autoSaveMemoryData=" + autoSaveMemoryData);
|
||||
Utils.writeStringToFile(commandFile, "autoSaveMemoryDataInterval=" + autoSaveMemoryDataInterval);
|
||||
|
||||
}
|
||||
|
||||
private static void readCommandToMemory() {
|
||||
readCommandToMemory(Collections.EMPTY_LIST);
|
||||
}
|
||||
|
||||
private static void readCommandToMemory(String key) {
|
||||
readCommandToMemory(Arrays.asList(new String[] {key}));
|
||||
}
|
||||
|
||||
private static void readCommandToMemory(List<String> keys) {
|
||||
|
||||
Set<String> command = new HashSet<String>(); //命令列表
|
||||
Utils.readFileDataToCollection(commandFile, command);
|
||||
for (String c : command) {
|
||||
String[] _c = c.split("=");
|
||||
if (_c.length != 2) {
|
||||
continue;
|
||||
}
|
||||
if (null != keys && !keys.isEmpty()) {
|
||||
boolean exists = false;
|
||||
for (String key : keys) {
|
||||
if (_c[0].equals(key)) {
|
||||
exists = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!exists) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
try {
|
||||
if (_c[0].equals("stop")) {
|
||||
stop = Boolean.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("imgMinSize")) {
|
||||
imgMinSize = Long.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("nextPageNames")) {
|
||||
nextPageNames = Arrays.stream(_c[1].split(",")).collect(Collectors.toList());
|
||||
} else if (_c[0].equals("pageTitlefilters")) {
|
||||
pageTitlefilters = Arrays.stream(_c[1].split(",")).collect(Collectors.toList());
|
||||
} else if (_c[0].equals("imgNamefilters")) {
|
||||
imgNamefilters = Arrays.stream(_c[1].split(",")).collect(Collectors.toList());
|
||||
} else if (_c[0].equals("imgSrcRepletTags")) {
|
||||
imgSrcRepletTags = Arrays.stream(_c[1].split(",")).collect(Collectors.toList());
|
||||
} else if (_c[0].equals("allowDifferentCharacters")) {
|
||||
allowDifferentCharacters = String.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("browserTimeout")) {
|
||||
browserTimeout = Integer.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("jsTimeout")) {
|
||||
jsTimeout = Integer.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("jsEnabled")) {
|
||||
jsEnabled = Boolean.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("waitForBackgroundJavaScript")) {
|
||||
waitForBackgroundJavaScript = Integer.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("refererUrl")) {
|
||||
refererUrl = _c[1];
|
||||
} else if (_c[0].equals("transboundarySites")) {
|
||||
transboundarySites = Arrays.stream(_c[1].split(",")).collect(Collectors.toSet());
|
||||
} else if (_c[0].equals("fixedUrlPrefix")) {
|
||||
fixedUrlPrefix = _c[1];
|
||||
} else if (_c[0].equals("thread")) {
|
||||
thread = Boolean.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("threadSize")) {
|
||||
threadSize = Integer.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("threadSleep")) {
|
||||
threadSleep = Integer.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("autoSaveMemoryData")) {
|
||||
autoSaveMemoryData = Boolean.valueOf(_c[1]);
|
||||
} else if (_c[0].equals("autoSaveMemoryDataInterval")) {
|
||||
autoSaveMemoryDataInterval = Integer.valueOf(_c[1]);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
Utils.error("参数错误" + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void writeMemoryDataToFile(List<String> pageLinks, List<String> lazyPageLinks) {
|
||||
Utils.info("正在保存重要的分析数据,请不要强制退出");
|
||||
//先用一条空数据把记录覆盖,再用新数据写入到记录文件中
|
||||
Utils.writeStringToFile(siteData, "", false);
|
||||
Utils.writeCollectionToFile(siteData, sites);
|
||||
Utils.writeStringToFile(allUrlData, "", false);
|
||||
Utils.writeCollectionToFile(allUrlData, pageSet);
|
||||
Utils.writeStringToFile(currentUrlData, "", false);
|
||||
Utils.writeCollectionToFile(currentUrlData, pageLinks);
|
||||
Utils.writeStringToFile(lazyUrlData, "", false);
|
||||
Utils.writeCollectionToFile(lazyUrlData, lazyPageLinks);
|
||||
Utils.writeStringToFile(downLoadedImgData, "", false);
|
||||
Utils.writeCollectionToFile(downLoadedImgData, imgDownloaded);
|
||||
Utils.writeStringToFile(downloadedImgDirNameData, "", false);
|
||||
Utils.writeCollectionToFile(downloadedImgDirNameData, imgDownloadedDirName);
|
||||
Utils.writeStringToFile(readTimeOutImgData, "", false);
|
||||
Utils.writePictInfoCollectionToFile(readTimeOutImgData, readTimeOutImgs);
|
||||
}
|
||||
|
||||
}
|
||||
|
47
src/main/java/com/hitoli/fetchPic/PictInfo.java
Normal file
47
src/main/java/com/hitoli/fetchPic/PictInfo.java
Normal file
@ -0,0 +1,47 @@
|
||||
package com.hitoli.fetchPic;
|
||||
|
||||
/**
 * Describes one picture: the site it belongs to, a title, the picture URL and
 * the URL of the HTML page it came from.
 *
 * <p>All four properties are never {@code null}: a {@code null} passed to the
 * constructor or to a setter is stored as the empty string.
 */
public class PictInfo {
    private String site;
    private String url;
    private String title;
    private String htmlUrl;

    /**
     * @param site    site the picture belongs to; {@code null} becomes {@code ""}
     * @param title   title; {@code null} becomes {@code ""}
     * @param url     picture URL; {@code null} becomes {@code ""}
     * @param htmlUrl URL of the originating HTML page; {@code null} becomes {@code ""}
     */
    public PictInfo(String site, String title, String url, String htmlUrl) {
        this.site = emptyIfNull(site);
        this.title = emptyIfNull(title);
        this.url = emptyIfNull(url);
        this.htmlUrl = emptyIfNull(htmlUrl);
    }

    public String getSite() {
        return site;
    }

    public void setSite(String site) {
        // Same null normalisation as the constructor, so getters never return null.
        this.site = emptyIfNull(site);
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = emptyIfNull(url);
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = emptyIfNull(title);
    }

    public String getHtmlUrl() {
        return htmlUrl;
    }

    public void setHtmlUrl(String htmlUrl) {
        this.htmlUrl = emptyIfNull(htmlUrl);
    }

    /** Returns {@code value}, or the empty string when {@code value} is {@code null}. */
    private static String emptyIfNull(String value) {
        return null == value ? "" : value;
    }
}
|
622
src/main/java/com/hitoli/fetchPic/Utils.java
Normal file
622
src/main/java/com/hitoli/fetchPic/Utils.java
Normal file
@ -0,0 +1,622 @@
|
||||
package com.hitoli.fetchPic;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneOffset;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.logging.FileHandler;
|
||||
import java.util.logging.Formatter;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.LogRecord;
|
||||
import java.util.logging.Logger;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
public class Utils {
|
||||
|
||||
private static Logger loginfo = null;
|
||||
private static Logger logerror = null;
|
||||
|
||||
static class MyLogHander extends Formatter {
|
||||
@Override
|
||||
public String format(LogRecord record) {
|
||||
return LocalDateTime.ofEpochSecond(record.getMillis()/1000, 0, ZoneOffset.ofHours(8)) +
|
||||
" " + record.getLevel() + " : " + record.getMessage()+"\n";
|
||||
}
|
||||
}
|
||||
|
||||
private static void createLogger() {
|
||||
loginfo = Logger.getLogger("fetchPicLog-info");
|
||||
loginfo.setLevel(Level.ALL);
|
||||
logerror = Logger.getLogger("fetchPicLog-error");
|
||||
logerror.setLevel(Level.WARNING);
|
||||
// ConsoleHandler consoleHandler = new ConsoleHandler();
|
||||
// consoleHandler.setLevel(Level.ALL);
|
||||
// loginfo.addHandler(consoleHandler);
|
||||
// logerror.addHandler(consoleHandler);
|
||||
FileHandler logInfoFileHandler = null;
|
||||
FileHandler logErrorFileHandler = null;
|
||||
try {
|
||||
logInfoFileHandler = new FileHandler(Main.downLoadDir + File.separator + "fetchPicLog-info.log");
|
||||
logErrorFileHandler = new FileHandler(Main.downLoadDir + File.separator + "fetchPicLog-error.log");
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
if (null != logInfoFileHandler) {
|
||||
logInfoFileHandler.setLevel(Level.INFO);
|
||||
logInfoFileHandler.setFormatter(new MyLogHander());
|
||||
loginfo.addHandler(logInfoFileHandler);
|
||||
} else {
|
||||
System.out.println("创建信息日志文件失败");
|
||||
}
|
||||
if (null != logErrorFileHandler) {
|
||||
logErrorFileHandler.setLevel(Level.WARNING);
|
||||
logErrorFileHandler.setFormatter(new MyLogHander());
|
||||
logerror.addHandler(logErrorFileHandler);
|
||||
} else {
|
||||
System.out.println("创建错误日志文件失败");
|
||||
}
|
||||
}
|
||||
|
||||
public static void info(String msg) {
|
||||
if (null == loginfo) {
|
||||
createLogger();
|
||||
}
|
||||
loginfo.info(msg);
|
||||
}
|
||||
|
||||
public static void error(String msg) {
|
||||
if (null == logerror) {
|
||||
createLogger();
|
||||
}
|
||||
logerror.warning(msg);
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建目录
|
||||
* @param dir
|
||||
*/
|
||||
public static void createDir(String dir){
|
||||
|
||||
File file = new File(dir);
|
||||
if(!file.exists()){
|
||||
file.mkdir();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建新的图片目录
|
||||
* @param dir
|
||||
*/
|
||||
public static void createNewImgDir(String dir){
|
||||
|
||||
File file = new File(dir);
|
||||
if(!file.exists()){
|
||||
file.mkdir();
|
||||
Main.imgDownloadedDirName.add(file.getName() + "#####" + file.getPath());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 读取目录下的目录
|
||||
* @param dir
|
||||
* @return
|
||||
*/
|
||||
public static List<File> getDirs(String dir) {
|
||||
File file = new File(dir);
|
||||
if(file.exists()){
|
||||
return Arrays.asList(file.listFiles()).stream().filter(f -> !f.isDirectory()).collect(Collectors.toList());
|
||||
}
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除目录
|
||||
* @param dir
|
||||
*/
|
||||
public static void delDir(String dir){
|
||||
|
||||
File file = new File(dir);
|
||||
if(file.exists() && file.isDirectory()){
|
||||
for (File f : file.listFiles()) {
|
||||
delFile(f.getPath());
|
||||
}
|
||||
file.delete();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建文件
|
||||
* @param fielPath
|
||||
*/
|
||||
public static void createFile(String fielPath){
|
||||
|
||||
File file = new File(fielPath);
|
||||
if(!file.exists()){
|
||||
try {
|
||||
file.createNewFile();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除文件
|
||||
* @param filePath
|
||||
*/
|
||||
public static void delFile(String filePath){
|
||||
|
||||
File file = new File(filePath);
|
||||
if(file.exists() && file.isFile()){
|
||||
file.delete();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查指定路径是否存在
|
||||
* @param filePath
|
||||
* @return
|
||||
*/
|
||||
public static boolean checkFilePathExists(String filePath) {
|
||||
return new File(filePath).exists();
|
||||
}
|
||||
|
||||
/**
|
||||
* 读取文件信息到集合中(一行一条数据)
|
||||
* @param filePath
|
||||
* @param collection
|
||||
*/
|
||||
public static void readFileDataToCollection(String filePath, Collection<String> collection) {
|
||||
|
||||
InputStreamReader in = null;
|
||||
BufferedReader br = null;
|
||||
try {
|
||||
in = new InputStreamReader(new FileInputStream(new File(filePath)));
|
||||
br = new BufferedReader(in);
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
if (StringUtils.isNotEmpty(line)) {
|
||||
collection.add(line);
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
} finally {
|
||||
closeIO(in, br, null, null);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 读取文件图片信息对象到集合中(一行一条数据)
|
||||
* @param filePath
|
||||
* @param collection
|
||||
*/
|
||||
public static void readFilePictInfoDataToCollection(String filePath, Collection<PictInfo> collection) {
|
||||
|
||||
InputStreamReader in = null;
|
||||
BufferedReader br = null;
|
||||
try {
|
||||
in = new InputStreamReader(new FileInputStream(new File(filePath)));
|
||||
br = new BufferedReader(in);
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
if (StringUtils.isNotEmpty(line)) {
|
||||
String[] datas = line.split("#####");
|
||||
if (datas.length >= 3) {
|
||||
PictInfo pictInfo = new PictInfo(datas[0], datas[1], datas[2], (datas.length == 3 ? null : datas[3]));
|
||||
collection.add(pictInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
} finally {
|
||||
closeIO(in, br, null, null);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 把字符串写入文件(一行一条),默认追加写入
|
||||
* @param filePath
|
||||
* @param str
|
||||
*/
|
||||
public static void writeStringToFile(String filePath, String str) {
|
||||
writeStringToFile(filePath, str, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* 把集合中的数据写到文件中(一行一条)
|
||||
* @param filePath
|
||||
* @param collection
|
||||
*/
|
||||
public static void writeCollectionToFile(String filePath, Collection<String> collection) {
|
||||
|
||||
if (null != collection && !collection.isEmpty()) {
|
||||
FileWriter fw = null;
|
||||
BufferedWriter out = null;
|
||||
try {
|
||||
fw = new FileWriter(new File(filePath), true);
|
||||
out = new BufferedWriter(fw);
|
||||
for (String str : collection) {
|
||||
out.write(str += "\r\n");
|
||||
}
|
||||
out.flush();
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
} finally {
|
||||
closeIO(null, null, fw, out);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 把集合中的图片对象数据写到文件中(一行一条)
|
||||
* @param filePath
|
||||
* @param collection
|
||||
*/
|
||||
public static void writePictInfoCollectionToFile(String filePath, Collection<PictInfo> collection) {
|
||||
|
||||
if (null != collection && !collection.isEmpty()) {
|
||||
FileWriter fw = null;
|
||||
BufferedWriter out = null;
|
||||
try {
|
||||
fw = new FileWriter(new File(filePath), true);
|
||||
out = new BufferedWriter(fw);
|
||||
for (PictInfo pictInfo : collection) {
|
||||
String str = pictInfo.getSite() + "#####" + pictInfo.getTitle() + "#####" + pictInfo.getUrl();
|
||||
out.write(str += "\r\n");
|
||||
}
|
||||
out.flush();
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
} finally {
|
||||
closeIO(null, null, fw, out);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 把字符串写入文件(一行一条)
|
||||
* @param filePath
|
||||
* @param str
|
||||
* @param append 是否追加
|
||||
*/
|
||||
public static void writeStringToFile(String filePath, String str, boolean append) {
|
||||
|
||||
FileWriter fw = null;
|
||||
BufferedWriter out = null;
|
||||
try {
|
||||
fw = new FileWriter(new File(filePath), append);
|
||||
out = new BufferedWriter(fw);
|
||||
if (!(StringUtils.isEmpty(str) && !append)) { //如果字符串为空并且不追加,则只写入空的字符串(不带换行符)
|
||||
str += "\r\n";
|
||||
}
|
||||
out.write(str);
|
||||
out.flush();
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
} finally {
|
||||
closeIO(null, null, fw, out);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void closeIO(InputStreamReader in, BufferedReader br, FileWriter fw, BufferedWriter out) {
|
||||
try {
|
||||
if (br != null) {
|
||||
br.close();
|
||||
}
|
||||
if (in != null) {
|
||||
in.close();
|
||||
}
|
||||
if (out != null) {
|
||||
out.close();
|
||||
}
|
||||
if (fw != null) {
|
||||
fw.close();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
Utils.error(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取域名(无多余的后缀)
|
||||
* @param context
|
||||
* @return
|
||||
*/
|
||||
public static String getSite(String context) {
|
||||
|
||||
String site = "";
|
||||
if (context.startsWith("http")) {
|
||||
int start = 7;
|
||||
String _start = "http://";
|
||||
if (context.startsWith("https")) {
|
||||
start = 8;
|
||||
_start = "https://";
|
||||
}
|
||||
site = context.substring(start);
|
||||
if (site.indexOf("/") != -1) {
|
||||
site = _start + site.substring(0, site.indexOf("/"));
|
||||
} else {
|
||||
site = _start + site;
|
||||
}
|
||||
} else {
|
||||
if (context.indexOf("/") != -1) {
|
||||
site = context.substring(0, context.indexOf("/"));
|
||||
} else {
|
||||
site = context;
|
||||
}
|
||||
}
|
||||
|
||||
return site;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取完整地址
|
||||
* @param site 无多余后缀的网站地址
|
||||
* @param path 有后缀的地址
|
||||
* @param url 抓取的地址
|
||||
* @return 如果抓取的地址开头是/,说明是从根目录开始。如果开头为..则表示从有后缀的地址往前回退
|
||||
*/
|
||||
public static String getFullPath(String site, String path, String url) {
|
||||
|
||||
if (url.startsWith("//")) {
|
||||
if (site.startsWith("https")) {
|
||||
url = "https:" + url;
|
||||
} else {
|
||||
url = "http:" +url;
|
||||
}
|
||||
} else if (url.startsWith("/")) {
|
||||
url = site + url;
|
||||
} else if (url.startsWith("../")) {
|
||||
url = path + url;
|
||||
} else if (url.startsWith("./") || (url.indexOf("/") == -1)) {
|
||||
url = path.substring(0, (path.lastIndexOf("/") + 1)) + url;
|
||||
}
|
||||
|
||||
return url;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查url是否在网站域名内,防止超出边界
|
||||
* @param url
|
||||
* @return
|
||||
*/
|
||||
public static Boolean checkSite(String url) {
|
||||
Boolean result = false;
|
||||
for (String s : Main.transboundarySites) {//先检查是否在可越界域名内
|
||||
s = getShortSite(s);
|
||||
if (url.indexOf(s) != -1) {
|
||||
result = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!result) {
|
||||
for (String s : Main.sites) {
|
||||
s = getShortSite(s);
|
||||
if (url.indexOf(s) != -1) {
|
||||
result = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!result) {
|
||||
info("扫描到的url=" + url + "越界");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取网站短域名(不包含www及http前缀)
|
||||
* @param site
|
||||
* @return
|
||||
*/
|
||||
public static String getShortSite(String site) {
|
||||
if (site.startsWith("http")) {
|
||||
int start = 7;
|
||||
if (site.startsWith("https")) {
|
||||
start = 8;
|
||||
}
|
||||
site = site.substring(start);
|
||||
}
|
||||
return site.substring(site.indexOf(".")+1);
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查url是否是域名地址
|
||||
* @param url
|
||||
* @return
|
||||
*/
|
||||
public static boolean isSite(String url) {
|
||||
boolean result = false;
|
||||
url = removeHttpOrHttps(url);
|
||||
if (url.substring(url.length() - 1, url.length()).equals("/")) {
|
||||
url = url.substring(0, url.length() - 1);
|
||||
}
|
||||
for (String site : Main.sites) {
|
||||
if (url.equalsIgnoreCase(removeHttpOrHttps(site))) {
|
||||
result = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 去除http://或https://
|
||||
* @param url
|
||||
* @return
|
||||
*/
|
||||
public static String removeHttpOrHttps(String url) {
|
||||
if (url.startsWith("http")) {
|
||||
int start = 7;
|
||||
if (url.startsWith("https")) {
|
||||
start = 8;
|
||||
}
|
||||
url = url.substring(start);
|
||||
}
|
||||
return url;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取可做为文件名的网站短域名(不包含www及http前缀,点用下划线替换)
|
||||
* @param site
|
||||
* @return
|
||||
*/
|
||||
public static String getUseFileNameShortSite(String site) {
|
||||
return getShortSite(site).replaceAll("\\.", "_");
|
||||
}
|
||||
|
||||
/**
|
||||
* 特殊符号去除
|
||||
* @param str
|
||||
* @return
|
||||
*/
|
||||
public static String specialSymbolRemoval(String str) {
|
||||
return str.replaceAll("[\\\\/:\\\\*\\\\?\\\\\"<>\\\\|]", "");
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查text是否"下一页"按钮中的文字
|
||||
* @param text
|
||||
* @return
|
||||
*/
|
||||
public static boolean isNextPageButton(String text) {
|
||||
boolean result = false;
|
||||
for (String nextPageName : Main.nextPageNames) {
|
||||
if (text.equals(nextPageName)) {
|
||||
result = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建图片文件
|
||||
* @param title
|
||||
* @return
|
||||
*/
|
||||
public static File createImgFile(String site, String title, String name, String suffix) throws IOException {
|
||||
File file = null;
|
||||
if (null == Main.imgDownloadedDirName || Main.imgDownloadedDirName.isEmpty()) {
|
||||
file = _createImgFile(site, title, name, suffix);
|
||||
} else {
|
||||
Double allowDifferentCharacters = Double.parseDouble(Main.allowDifferentCharacters);
|
||||
String oldPath = null;
|
||||
String newTitle = title.replaceAll("(\\[+)|(\\]+)|(\\s)|(\\.+)|(\\_+)|(\\-+)|(\\——+)|(\\-+)|(\\第\\d+\\页)|(\\第)|(\\页)", "");
|
||||
char[] newChar = newTitle.toCharArray();
|
||||
for (String s : Main.imgDownloadedDirName) {
|
||||
String[] _s = s.split("#####");
|
||||
if (_s.length == 2) {
|
||||
String oldTitle = _s[0].replaceAll("(\\[+)|(\\]+)|(\\s)|(\\.+)|(\\_+)|(\\-+)|(\\——+)|(\\-+)|(\\第\\d+\\页)|(\\第)|(\\页)", "");
|
||||
if (oldTitle.equalsIgnoreCase(newTitle)) {//名称一样
|
||||
oldPath = _s[1];
|
||||
break;
|
||||
}
|
||||
char[] oldChar = oldTitle.toCharArray();
|
||||
int difference = 0;
|
||||
char[] maxLengthChar = null;
|
||||
char[] minLengthChar = null;
|
||||
if (oldChar.length >= newChar.length) {
|
||||
maxLengthChar = oldChar;
|
||||
minLengthChar = newChar;
|
||||
} else {
|
||||
maxLengthChar = newChar;
|
||||
minLengthChar = oldChar;
|
||||
}
|
||||
if (allowDifferentCharacters < 1 && allowDifferentCharacters > 0) {//取相差百分比
|
||||
allowDifferentCharacters = maxLengthChar.length - (maxLengthChar.length * allowDifferentCharacters);
|
||||
}
|
||||
if (maxLengthChar.length != minLengthChar.length &&
|
||||
((maxLengthChar.length - minLengthChar.length) > allowDifferentCharacters)) {//长度已经超过相差值
|
||||
continue;
|
||||
}
|
||||
for (int j=0; j<maxLengthChar.length; j++) {
|
||||
if (!String.valueOf(maxLengthChar[j]).equals(String.valueOf(minLengthChar[j]))) {
|
||||
difference++;
|
||||
}
|
||||
}
|
||||
|
||||
if (difference <= allowDifferentCharacters) {//判断是否在允许的不同字符数之内
|
||||
oldPath = _s[1];
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (StringUtils.isNotEmpty(oldPath)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (StringUtils.isNotEmpty(oldPath)) {
|
||||
createDir(oldPath);
|
||||
file = new File(oldPath + File.separator + name + suffix);
|
||||
} else {
|
||||
file = _createImgFile(site, title, name, suffix);
|
||||
}
|
||||
}
|
||||
if (null == file) {
|
||||
throw new IOException("创建图片文件失败");
|
||||
}
|
||||
return file;
|
||||
}
|
||||
|
||||
private static File _createImgFile(String site, String title, String name, String suffix) {
|
||||
if (!checkFilePathExists(Main.downLoadDir + File.separator + site + File.separator + title)) {
|
||||
createDir(Main.downLoadDir + File.separator + site);
|
||||
createNewImgDir(Main.downLoadDir + File.separator + site + File.separator + title);
|
||||
}
|
||||
return new File(Main.downLoadDir + File.separator + site
|
||||
+ File.separator + title + File.separator + name + suffix);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user