init project

This commit is contained in:
lihaitao
2022-09-10 00:10:10 +08:00
commit b68a6dd5b8
8 changed files with 1713 additions and 0 deletions

88
pom.xml Normal file
View File

@ -0,0 +1,88 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the fetchPic project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates. -->
<groupId>com.hitoli.fetchPic</groupId>
<artifactId>fetchPic</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<!-- Sources and reports are UTF-8; the code is compiled for Java 1.8 (see the compiler plugin below). -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
<maven-compiler.version>3.7.0</maven-compiler.version>
</properties>
<dependencies>
<!-- Apache HttpComponents modules and their companions (commons-codec, commons-logging). -->
<!-- NOTE(review): the Java sources import org.apache.http.impl.client.*, but the httpclient artifact -->
<!-- is not declared here; presumably it arrives transitively. Confirm. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.9</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>fluent-hc</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient-cache</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient-win</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.6</version>
</dependency>
<!-- jsoup: the Java sources use it to parse HTML into Document/Element objects. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- HtmlUnit: the Java sources use its WebClient to load pages with JavaScript support. -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.32</version>
</dependency>
<!-- commons-lang3: provides the StringUtils helper used by the Java sources. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.7</version>
</dependency>
</dependencies>
<build>
<!-- Pins the compiler plugin version and its source/target level to the properties above. -->
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>${maven-compiler.version}</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@ -0,0 +1,3 @@
Manifest-Version: 1.0
Main-Class: com.hitoli.fetchPic.Main

View File

@ -0,0 +1,166 @@
package com.hitoli.fetchPic;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.SocketTimeoutException;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
public class DownLoad {
public static RequestConfig defaultRequestConfig = RequestConfig.custom()
.setSocketTimeout(5000)
.setConnectTimeout(5000)
.setConnectionRequestTimeout(5000)
.setStaleConnectionCheckEnabled(true)
.setRedirectsEnabled(true)
.setMaxRedirects(3)
.build();
public static CloseableHttpClient httpClient = HttpClients.custom().
setDefaultRequestConfig(defaultRequestConfig).build();
public static void downloadPict(PictInfo pictInfo) {
String url = pictInfo.getUrl();
CloseableHttpResponse response = null;
OutputStream out = null;
InputStream in=null;
BufferedReader br=null;
byte buffer[] = new byte[1024];
if(StringUtils.isNotEmpty(url)){
try {
String suffix = url.substring(url.lastIndexOf("."));
String temp = suffix.substring(1, suffix.length()).toUpperCase();
if (!(temp.equals("BMP") || temp.equals("JPG") || temp.equals("JPEG") || temp.equals("GIF") ||
temp.equals("PNG") || temp.equals("WEBP"))) { //非图片的丢弃
return;
}
String name = url.substring(url.lastIndexOf("/")+1, url.lastIndexOf("."));
if (StringUtils.isEmpty(name)) {
name = String.valueOf(System.currentTimeMillis());
} else {
name = Utils.specialSymbolRemoval(name + "_" + System.currentTimeMillis());
}
HttpGet httpGet = new HttpGet(url);
httpGet.setConfig(defaultRequestConfig);
if (StringUtils.isNotEmpty(Main.refererUrl)) {
if (Main.refererUrl.equalsIgnoreCase("self")) {
httpGet.setHeader("referer", pictInfo.getHtmlUrl());
} else {
httpGet.setHeader("referer", Main.refererUrl);
}
}
response = httpClient.execute(httpGet);
HttpEntity entity = response.getEntity();
long imgSize = entity.getContentLength();
if (imgSize < Main.imgMinSize*1024) { //默认图片小于1KB的丢弃
throw new Exception("图片只有" + (imgSize/1024) + "KB小于" + Main.imgMinSize + "KB");
}
in = entity.getContent();
String title = pictInfo.getTitle();
if (StringUtils.isEmpty(title)) {
title = "other";
} else {
title = Utils.specialSymbolRemoval(title);
}
File file = Utils.createImgFile(Utils.getUseFileNameShortSite(pictInfo.getSite()), title, name, suffix);
Utils.info("正在下载:" + url);
out = new FileOutputStream(file);
int index = 0;
while((index = in.read(buffer)) != -1){
out.write(buffer,0,index);
}
out.flush();
Main.imgDownloaded.add(url);
} catch (Exception e) {
Utils.error("下载失败:" + url + " [" + e.getMessage() + "]");
if (e instanceof SocketTimeoutException) {
Main.readTimeOutImgs.add(pictInfo);
}
} finally {
try {
if (br != null){
br.close();
}
if (out != null){
out.close();
}
if (in != null){
in.close();
}
if (response != null) {
response.close();
}
} catch (IOException e) {
e.printStackTrace();
Utils.error(e.getMessage());
}
}
}
}
public static void downloadPict(List<PictInfo> pictLinks){
if (null != pictLinks && !pictLinks.isEmpty()) {
if (Main.thread) {
for (int i=0; i<pictLinks.size(); i++) {
PictInfo pictInfo = pictLinks.get(i);
if (pictInfo != null && !Main.imgDownloaded.contains(pictInfo.getUrl())) {
new Thread(new DownloadPictRunnable(pictInfo)).start();
}
if ((i+1)%Main.threadSize == 0) {
try {
Thread.sleep(Main.threadSleep*1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
} else {
for (PictInfo pictInfo : pictLinks) {
if (pictInfo != null && !Main.imgDownloaded.contains(pictInfo.getUrl())) {
downloadPict(pictInfo);
}
}
}
pictLinks.clear();
}
}
public static class DownloadPictRunnable implements Runnable {
private PictInfo pictInfo;
public DownloadPictRunnable(PictInfo pictInfo) {
this.pictInfo = pictInfo;
}
@Override
public void run() {
DownLoad.downloadPict(pictInfo);
}
}
}

View File

@ -0,0 +1,255 @@
package com.hitoli.fetchPic;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class FindLink {
/**
 * Collects the links ({@code <a href>}) found on the page at {@code url}.
 *
 * <p>Links recognised as "next page" are inserted at index 1 of {@code pageLinks} so they are
 * processed right after the current page. Links that sit inside a header/logo container,
 * contain no image, or are site roots go to {@code lazyPageLinks}; all other links are
 * appended to {@code pageLinks}. If no "next page" button is found, the next page is guessed
 * from the numbers embedded in the URLs. The URL is added to {@code Main.pageSet} afterwards;
 * URLs already in that set are not analysed again.</p>
 *
 * @param url the page to analyse
 * @param pageLinks links to process next; receives the image-bearing links
 * @param lazyPageLinks links whose scan is deferred
 * @param connectExceptionRetryCount number of retries left when loading or analysing fails
 * @return {@code false} if the page could not be analysed and no retries are left,
 *         {@code true} otherwise (including when the page had been analysed before)
 */
public static boolean addPageLink(String url, List<String> pageLinks, List<String> lazyPageLinks, int connectExceptionRetryCount) {
    if (Main.pageSet.contains(url)) {
        return true;
    }
    boolean result = true;
    Utils.info("开始分析url[" + url + "]中的可用连接");
    try {
        Document document = HttpUtils.getInstance().getHtmlPageResponseAsDocument(url);
        String site = Utils.getSite(url);
        boolean findNext = false;
        // Every accepted link of this page; input for the numeric next-page guess below.
        Set<String> allUrls = new HashSet<String>();
        for (Element element : document.select("a")) {
            String href = element.attr("href");
            if (!isCandidateHref(href)) {
                continue;
            }
            href = Utils.getFullPath(site, url, href);
            if (StringUtils.isNotEmpty(Main.fixedUrlPrefix) && !href.startsWith(Main.fixedUrlPrefix)) {
                continue;
            }
            if (href.equals(url) || Main.pageSet.contains(href)) {
                continue;
            }
            if (!Utils.checkSite(href)) {
                continue;
            }
            Elements imgs = element.select("img");
            if (linksToOwnImage(imgs, site, url, href)) {
                continue;
            }
            allUrls.add(href);
            String text = element.text().trim();
            String title = element.attr("title");
            if (Utils.isNextPageButton(text) || Utils.isNextPageButton(title)) {
                // The next page always takes priority over everything else.
                insertNextPage(pageLinks, href);
                findNext = true;
            } else if (isInsidePageHeader(element) || imgs.isEmpty() || Utils.isSite(href)) {
                // Links without pictures are scanned later.
                if (!lazyPageLinks.contains(href)) {
                    lazyPageLinks.add(href);
                }
            } else if (!pageLinks.contains(href)) {
                pageLinks.add(href);
            }
        }
        if (!findNext) {
            String nextPageUrl = findNextPageByNumber(url, allUrls);
            if (nextPageUrl != null) {
                insertNextPage(pageLinks, nextPageUrl);
            }
        }
    } catch (Exception e) {
        if (connectExceptionRetryCount <= 0) {
            Utils.error("无效地址:" + url);
            result = false;
        } else {
            // The attempt number assumes the caller started with 5 retries.
            Utils.error("重试访问地址:" + url + (5 - connectExceptionRetryCount + 1));
            return addPageLink(url, pageLinks, lazyPageLinks, connectExceptionRetryCount - 1);
        }
    }
    Utils.info("分析url[" + url + "]中的可用连接结束");
    Main.pageSet.add(url);
    return result;
}

/** Returns whether a raw href is worth resolving: not empty, not "#", not a script or stylesheet. */
private static boolean isCandidateHref(String href) {
    return !"".equals(href) && !"#".equals(href)
            && href.indexOf(".js") == -1
            && href.indexOf(".css") == -1
            && href.indexOf("javascript") == -1;
}

/** Returns whether one of the images inside the anchor has the anchor's own target as source. */
private static boolean linksToOwnImage(Elements imgs, String site, String url, String href) {
    for (Element img : imgs) {
        // Such an anchor points at the picture itself, so there is no page to analyse.
        if (Utils.getFullPath(site, url, img.attr("src")).equals(href)) {
            return true;
        }
    }
    return false;
}

/** Returns whether any ancestor of the element carries a "head", "header" or "logo" class. */
private static boolean isInsidePageHeader(Element element) {
    for (Element parent : element.parents()) {
        for (String className : parent.classNames()) {
            if (className.equalsIgnoreCase("head")
                    || className.equalsIgnoreCase("header")
                    || className.equalsIgnoreCase("logo")) {
                return true;
            }
        }
    }
    return false;
}

/** Puts the link directly behind the current page (index 1), or at the end of a shorter list. */
private static void insertNextPage(List<String> pageLinks, String href) {
    pageLinks.add(Math.min(1, pageLinks.size()), href);
}

/**
 * Guesses the next page from the digits between the last "/" and the last "." of the URLs.
 *
 * <p>Example: for the current page {@code xxx/xxx_0.html} and the links {@code xxx/xxx_1.html}
 * and {@code xxx/xxx_2.html} the numbers are 0, 1 and 2; the smallest number greater than the
 * current one (1) marks the next page.</p>
 *
 * @return the guessed next page, or {@code null} when the URLs carry no usable numbers
 */
private static String findNextPageByNumber(String url, Set<String> candidates) {
    Long current = fileNameNumber(url);
    if (current == null || candidates.isEmpty()) {
        return null;
    }
    String directory = url.substring(0, url.lastIndexOf("/") + 1);
    String nextPageUrl = null;
    long nextPageNumber = 0L;
    for (String candidate : candidates) {
        if (url.equalsIgnoreCase(candidate) || candidate.indexOf(directory) == -1) {
            continue;
        }
        Long number = fileNameNumber(candidate);
        if (number == null || number <= current) {
            continue;
        }
        if (nextPageUrl == null || number < nextPageNumber) {
            nextPageNumber = number;
            nextPageUrl = candidate;
        }
    }
    return nextPageUrl;
}

/**
 * Returns the number formed by all digits between the last "/" and the last "." of the URL,
 * or {@code null} if that part does not exist, has no digits, or has too many for a long.
 */
private static Long fileNameNumber(String url) {
    int slash = url.lastIndexOf("/");
    int dot = url.lastIndexOf(".");
    if (dot <= slash) {
        return null;
    }
    StringBuilder digits = new StringBuilder();
    for (int i = slash + 1; i < dot; i++) {
        char c = url.charAt(i);
        if (c >= '0' && c <= '9') {
            digits.append(c);
        }
    }
    // 18 digits always fit into a long, so parsing cannot overflow.
    if (digits.length() == 0 || digits.length() > 18) {
        return null;
    }
    return Long.valueOf(digits.toString());
}
/**
 * Collects the downloadable pictures ({@code <img>}) found on the page at {@code url}.
 *
 * <p>The page title, with every pattern of {@code Main.pageTitlefilters} removed, is stored
 * with each picture. The picture address is read from the first attribute of
 * {@code Main.imgSrcRepletTags} present on the element, falling back to {@code src}.
 * Pictures whose file name matches {@code Main.imgNamefilters}, pictures already recorded in
 * {@code Main.imgDownloaded}, and pictures wrapped in an anchor that leads elsewhere (treated
 * as previews) are skipped.</p>
 *
 * @param url the page to analyse
 * @param pictInfos receives the pictures found
 * @param nextPageUrl address of the next page, may be {@code null}; pictures that link to it are kept
 * @param connectExceptionRetryCount number of retries left when loading or analysing fails
 * @return {@code false} if the page could not be analysed and no retries are left
 */
public static boolean addPictLink(String url, List<PictInfo> pictInfos, String nextPageUrl, int connectExceptionRetryCount) {
    boolean result = true;
    if (null == nextPageUrl) {
        nextPageUrl = "";
    }
    Utils.info("开始分析url[" + url + "]中的可用图片连接");
    try {
        Document document = HttpUtils.getInstance().getHtmlPageResponseAsDocument(url);
        String site = Utils.getSite(url);
        Elements elements = document.select("img");
        String title = readFilteredTitle(document);
        for (Element element : elements) {
            String src = element.attr(imageAttributeName(element));
            if (StringUtils.isEmpty(src) || src.toUpperCase().indexOf("JAVASCRIPT") != -1) {
                continue;
            }
            if (isFilteredImageName(src)) {
                continue;
            }
            src = Utils.getFullPath(site, url, src);
            Element parent = element.parent();
            if (parent.tagName().toUpperCase().equals("A")) {
                // An image inside an anchor is normally just a preview. It is kept only when
                // the anchor has no target, points at the image itself, or leads to the next page.
                String parentUrl = parent.attr("href");
                String href = Utils.getFullPath(site, url, parentUrl);
                if (StringUtils.isNotEmpty(parentUrl) && !href.equals(src) && !nextPageUrl.equals(href)) {
                    continue;
                }
            }
            if (!Main.imgDownloaded.contains(src)) {
                pictInfos.add(new PictInfo(site, title, src, url));
            }
        }
    } catch (Exception e) {
        if (connectExceptionRetryCount <= 0) {
            Utils.error("无效地址:" + url);
            result = false;
        } else {
            // The attempt number assumes the caller started with 5 retries.
            Utils.error("重试访问地址:" + url + (5 - connectExceptionRetryCount + 1));
            // Report the outcome of the retry instead of claiming success.
            return addPictLink(url, pictInfos, nextPageUrl, connectExceptionRetryCount - 1);
        }
    }
    Utils.info("分析url[" + url + "]中的可用图片连接结束");
    return result;
}

/** Returns the text of the first {@code <title>} with all configured title patterns removed. */
private static String readFilteredTitle(Document document) {
    Elements titles = document.head().getElementsByTag("title");
    if (null == titles || titles.isEmpty()) {
        return "";
    }
    String title = titles.get(0).text();
    if (StringUtils.isNotEmpty(title) && null != Main.pageTitlefilters) {
        for (String filter : Main.pageTitlefilters) {
            // Each filter is applied as a regular expression.
            title = title.replaceAll(filter, "");
        }
    }
    return title;
}

/** Returns the attribute holding the picture address: a configured replacement if present, else "src". */
private static String imageAttributeName(Element element) {
    if (null != Main.imgSrcRepletTags) {
        for (String tag : Main.imgSrcRepletTags) {
            if (element.hasAttr(tag)) {
                return tag;
            }
        }
    }
    return "src";
}

/** Returns whether the upper-cased file name (between last "/" and last ".") is on the discard list. */
private static boolean isFilteredImageName(String src) {
    int slash = src.lastIndexOf("/");
    int dot = src.lastIndexOf(".");
    if (Main.imgNamefilters.isEmpty() || dot <= slash) {
        // Without a "name.ext" tail there is nothing to compare against the filters.
        return false;
    }
    String fileName = src.substring(slash + 1, dot).toUpperCase();
    for (String imgNamefilter : Main.imgNamefilters) {
        if (fileName.equals(imgNamefilter)) {
            return true;
        }
    }
    return false;
}
}

View File

@ -0,0 +1,115 @@
package com.hitoli.fetchPic;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * HTTP helper that loads pages through HtmlUnit, so that the returned HTML reflects the page
 * after its scripts have run (when JavaScript is enabled), and converts it to a jsoup document.
 *
 * <p>{@link #initWebClient(int, int, boolean, int)} must be called before pages are fetched.</p>
 */
public class HttpUtils {
    /** The single shared instance. */
    private static final HttpUtils INSTANCE = new HttpUtils();

    private WebClient webClient;

    /** Time to wait for background JavaScript; passed unchanged to {@code WebClient.waitForBackgroundJavaScript}. */
    private int waitForBackgroundJavaScript;

    private HttpUtils() {
    }

    /**
     * Returns the shared instance.
     *
     * @return the singleton
     */
    public static HttpUtils getInstance() {
        return INSTANCE;
    }

    /**
     * Creates and configures the underlying web client. A client created by an earlier call is closed.
     *
     * @param browserTimeout request timeout of the "browser"
     * @param jsTimeout timeout for JavaScript execution
     * @param jsEnabled whether JavaScript (and AJAX resynchronisation) is enabled
     * @param waitForBackgroundJavaScript time to wait for background JavaScript after a page load
     */
    public void initWebClient(int browserTimeout, int jsTimeout, boolean jsEnabled, int waitForBackgroundJavaScript) {
        if (webClient != null) {
            // Release the windows and threads of the client that is being replaced.
            webClient.close();
        }
        this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
        webClient = new WebClient(BrowserVersion.CHROME);
        // Neither script errors nor non-200 status codes abort a page load.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setActiveXNative(false);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setJavaScriptEnabled(jsEnabled);
        if (jsEnabled) {
            // Required for AJAX support.
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        }
        webClient.getOptions().setTimeout(browserTimeout);
        webClient.setJavaScriptTimeout(jsTimeout);
    }

    /**
     * Parses HTML text into a document, dropping every {@code &nbsp;} entity.
     *
     * @param html the HTML text
     * @return the parsed document
     * @throws Exception declared for callers; not thrown by the current implementation itself
     */
    public static Document parseHtmlToDoc(String html) throws Exception {
        return removeHtmlSpace(html);
    }

    /** Parses the text, removes "&amp;nbsp;" from the serialised result and parses it again. */
    private static Document removeHtmlSpace(String str) {
        String normalised = Jsoup.parse(str).html().replace("&nbsp;", "");
        return Jsoup.parse(normalised);
    }

    /**
     * Loads the page and returns it as XML text after waiting for background JavaScript.
     *
     * @param url the page URL
     * @return the page serialised by {@code HtmlPage.asXml()}
     * @throws IllegalStateException if {@link #initWebClient(int, int, boolean, int)} was not called
     * @throws Exception if the page cannot be loaded
     */
    public String getHtmlPageResponse(String url) throws Exception {
        if (webClient == null) {
            throw new IllegalStateException("initWebClient must be called before a page is requested");
        }
        HtmlPage page = webClient.getPage(url);
        // Blocks the calling thread until the jobs finish or the wait time elapses.
        webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);
        return page.asXml();
    }

    /**
     * Loads the page and returns it as a jsoup document (after waiting for background JavaScript).
     *
     * @param url the page URL
     * @return the parsed page
     * @throws Exception if the page cannot be loaded
     */
    public Document getHtmlPageResponseAsDocument(String url) throws Exception {
        return parseHtmlToDoc(getHtmlPageResponse(url));
    }
}

View File

@ -0,0 +1,417 @@
package com.hitoli.fetchPic;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
public class Main {
// Crawl state and settings shared by the whole program. Most settings can be overridden
// through the command file (see readCommandToMemory).
public static Set<String> sites = new HashSet<String>(); // roots of all sites that may be crawled (keeps the crawl inside them)
public static Set<String> transboundarySites = new HashSet<String>(); // site roots the crawl is allowed to cross over to
public static Set<String> pageSet = new HashSet<String>(); // every address that has already been crawled
public static Set<PictInfo> readTimeOutImgs = new HashSet<PictInfo>(); // every picture whose download timed out while reading
public static Set<String> imgDownloaded = new HashSet<String>(); // addresses of the pictures already downloaded
public static Set<String> imgDownloadedDirName = new HashSet<String>();// directory names of the pictures already downloaded
public static String home = null;
public static String downLoadDir = null; // download directory
public static String siteData = null; // file recording the domains that may be scanned (prevents leaving them)
public static String downLoadedImgData = null; // file recording the downloaded picture addresses
public static String downloadedImgDirNameData = null; // file recording the downloaded picture directory names
public static String allowDifferentCharacters = "0";// number of characters by which a picture title may differ from a recorded directory name; used to check whether the target directory already exists
public static String readTimeOutImgData = null; // file recording the pictures that timed out
public static String allUrlData = null; // file recording every crawled address
public static String currentUrlData = null; // file recording the addresses currently being processed
public static String lazyUrlData = null; // file recording the deferred addresses
public static String commandFile = null;// command file; checked once per iteration over pageLinks
public static long imgMinSize = 1; // minimum size of a picture to download, in KB
// NOTE(review): the empty entry "" below may be a character lost in transit - confirm.
public static List<String> nextPageNames = Arrays.stream(
new String[]{"下一页", "下一篇", "下一章", "", "NEXT", ">", ">>", ">>>"}
).collect(Collectors.toList()); // texts that identify a "next page" button
// NOTE(review): as written, this pattern requires a literal backslash before the digits - confirm it is intended.
public static List<String> pageTitlefilters = Arrays.stream(
new String[]{"(\\\\d+\\页)"}
).collect(Collectors.toList()); // patterns removed from page titles (every match is removed)
public static List<String> imgNamefilters = Arrays.stream(
new String[]{"LOGO", "FAVICON"}
).collect(Collectors.toList()); // names of pictures that are discarded
public static List<String> imgSrcRepletTags = Arrays.stream(
new String[]{"original"}
).collect(Collectors.toList()); // attributes that, when present on an img element, hold the download address instead of src
public static String refererUrl = "self"; // referer sent when downloading a picture; "self" (default) means the page the picture was found on
public static String fixedUrlPrefix = ""; // when set, only URLs with this prefix are scanned
public static int browserTimeout = 5000;// request timeout, default 5 seconds
public static int jsTimeout = 1000;// JavaScript timeout, default 1 second
// NOTE(review): this value is passed unchanged to WebClient.waitForBackgroundJavaScript;
// if that API expects milliseconds, the default is 1 ms rather than 1 second - confirm.
public static int waitForBackgroundJavaScript = 1;// time to wait for background JavaScript, described as "default 1 second"
public static boolean jsEnabled = true;// whether JavaScript is enabled
public static boolean thread = false; // whether pictures are downloaded on multiple threads
public static int threadSize = 5; // number of threads started per batch
public static int threadSleep = 1; // seconds to wait after a full batch of threads was started
public static boolean stop = false; // whether the program should exit
public static boolean autoSaveMemoryData = true; // whether the in-memory data is saved to file automatically
public static int autoSaveMemoryDataInterval = 3; // save interval in minutes
public static Long lastAutoSaveMemoryDataTime = null; // time of the last save
/**
 * Program entry point: initialises the crawler and processes the page queue until it is
 * empty, a stop command is read, or five URLs in a row fail (in which case the state is
 * saved and the program exits).
 *
 * @param args passed on to {@code init}
 */
public static void main(String[] args) {
    HttpUtils.getInstance().initWebClient(browserTimeout, jsTimeout, jsEnabled, waitForBackgroundJavaScript);
    List<String> pageLinks = new ArrayList<String>();
    List<String> lazyPageLinks = new ArrayList<>();
    List<PictInfo> pictInfos = new ArrayList<PictInfo>();
    init(args, pageLinks, lazyPageLinks, pictInfos);

    // Remaining number of consecutive failed URLs tolerated before giving up.
    int networkExceptionRetryCount = 5;
    Set<String> failUrls = new HashSet<String>();

    while (!pageLinks.isEmpty()) {
        // Number of retries allowed for one and the same URL.
        final int retriesPerUrl = 5;
        String url = pageLinks.get(0);
        boolean outsidePrefix = StringUtils.isNotEmpty(url)
                && StringUtils.isNotEmpty(fixedUrlPrefix)
                && !url.startsWith(fixedUrlPrefix);
        if (StringUtils.isEmpty(url) || outsidePrefix) {
            pageLinks.remove(0);
            continue;
        }

        // Step 1: collect the links of the page.
        boolean linksFound = FindLink.addPageLink(url, pageLinks, lazyPageLinks, retriesPerUrl);
        boolean picturesFound = false;
        if (linksFound) {
            // The page was reachable, so the failure budget is restored.
            networkExceptionRetryCount = 5;
            String nextPageUrl = pageLinks.size() >= 2 ? pageLinks.get(1) : "";
            // Step 2: collect the pictures of the page.
            picturesFound = FindLink.addPictLink(url, pictInfos, nextPageUrl, retriesPerUrl);
        }
        if (linksFound && picturesFound) {
            networkExceptionRetryCount = 5;
        } else {
            failUrls.add(url);
            networkExceptionRetryCount--;
        }

        if (networkExceptionRetryCount <= 0) {
            Utils.error("网络可能出现问题连续访问5次不同url失败");
            // Put the failed URLs back in front so that the next run starts with them.
            pageLinks.remove(0);
            pageLinks.addAll(0, failUrls);
            pageSet.removeAll(failUrls);
            writeMemoryDataToFile(pageLinks, lazyPageLinks);
            writeDefaultCommand();
            System.exit(0);
        }

        if (pictInfos.isEmpty()) {
            Utils.info("url[" + url + "]中找到0可用图片连接");
        } else {
            Utils.info("url[" + url + "]中找到" + pictInfos.size() + "个可用图片连接");
            DownLoad.downloadPict(pictInfos);
        }

        pageLinks.remove(0);
        // Once the picture pages are exhausted, continue with a deferred link.
        getLazyUrl(pageLinks, lazyPageLinks);
        // Exits the program when a stop was requested or no pages are left.
        checkAndExecutCommand(pageLinks, lazyPageLinks);
        // Saves the in-memory data when the auto-save interval has elapsed.
        checkAutoSaveMemoryData(pageLinks, lazyPageLinks);
    }
}
/**
 * Prepares a crawl: resolves the working directories and record files, loads the state of a
 * previous run into memory, seeds the page queue and retries pictures that timed out last time.
 *
 * <p>Arguments: {@code prams[0]} is the home directory (must exist), {@code prams[1]} the start
 * URL (required when no queued URL was restored), {@code prams[2]} the optional minimum picture
 * size in KB. The program exits with status 1 if the home directory or the start URL is missing.</p>
 *
 * @param prams command line arguments
 * @param pageLinks queue of pages to process; filled from the record file or the start URL
 * @param lazyPageLinks queue of deferred pages; filled from the record file
 * @param pictInfos not used during initialisation; kept for signature compatibility
 */
private static void init(String[] prams, List<String> pageLinks, List<String> lazyPageLinks, List<PictInfo> pictInfos) {
    // Download threads touch these sets, and multi-threading can be switched on through the
    // command file at any time, so the sets are always synchronized. The list parameters are
    // deliberately NOT replaced: reassigning them would disconnect them from the caller.
    // NOTE(review): imgDownloadedDirName is left as is; confirm whether download threads use it.
    sites = Collections.synchronizedSet(new HashSet<String>());
    pageSet = Collections.synchronizedSet(new HashSet<String>());
    imgDownloaded = Collections.synchronizedSet(new HashSet<String>());
    readTimeOutImgs = Collections.synchronizedSet(new HashSet<PictInfo>());

    // Directory below which the pictures are stored.
    home = "";
    if (null != prams && prams.length > 0) {
        home = prams[0];
    }
    System.out.println("HOME路径为" + home);
    if (!Utils.checkFilePathExists(home)) {
        System.out.println("HOME路径不存在,请手动创建");
        System.exit(1);
    }
    downLoadDir = home.endsWith(File.separator) ? (home + "pict") : (home + File.separator + "pict");
    Utils.createDir(downLoadDir);

    siteData = downLoadDir + File.separator + "siteData";
    downLoadedImgData = downLoadDir + File.separator + "downLoadedImgData";
    downloadedImgDirNameData = downLoadDir + File.separator + "downloadedImgDirNameData";
    readTimeOutImgData = downLoadDir + File.separator + "readTimeOutImgData";
    allUrlData = downLoadDir + File.separator + "allUrlData";
    currentUrlData = downLoadDir + File.separator + "currentUrlData";
    lazyUrlData = downLoadDir + File.separator + "lazyUrlData";
    commandFile = downLoadDir + File.separator + "command";

    // Create the record files (the original comments state: only if they do not exist yet).
    Utils.createFile(siteData);
    Utils.createFile(downLoadedImgData);
    Utils.createFile(downloadedImgDirNameData);
    Utils.createFile(readTimeOutImgData);
    Utils.createFile(allUrlData);
    Utils.createFile(currentUrlData);
    Utils.createFile(lazyUrlData);
    Utils.createFile(commandFile);

    // Loads the settings stored in the command file, then rewrites the file with all settings.
    writeDefaultCommand();

    // Restore the state of the previous run.
    Utils.readFileDataToCollection(siteData, sites);
    Utils.readFileDataToCollection(downLoadedImgData, imgDownloaded);
    Utils.readFileDataToCollection(downloadedImgDirNameData, imgDownloadedDirName);
    Utils.readFilePictInfoDataToCollection(readTimeOutImgData, readTimeOutImgs);
    Utils.readFileDataToCollection(allUrlData, pageSet);
    Utils.readFileDataToCollection(currentUrlData, pageLinks);
    Utils.readFileDataToCollection(lazyUrlData, lazyPageLinks);

    // When no picture page is queued, continue with a deferred link.
    getLazyUrl(pageLinks, lazyPageLinks);

    if (pageLinks.isEmpty()) {
        if (null == prams || prams.length < 2) {
            Utils.error("请输入抓取地址");
            System.exit(1);
        } else {
            pageLinks.add(prams[1]);
            sites.add(Utils.getSite(prams[1]));
            Utils.writeStringToFile(siteData, "", false);
            Utils.writeCollectionToFile(siteData, sites);
        }
    }

    if (null != prams && prams.length >= 3) {
        // NOTE(review): writeDefaultCommand() reloads imgMinSize from the command file before
        // rewriting it, so this value is replaced at the next automatic save - confirm intent.
        Integer requestedMinSize = null;
        try {
            requestedMinSize = Integer.valueOf(prams[2]);
        } catch (NumberFormatException ignored) {
            // Reported below together with negative values.
        }
        if (requestedMinSize == null || requestedMinSize < 0) {
            Utils.error("抓取最小图片大小输入有误必须大于等于0");
        } else {
            imgMinSize = requestedMinSize;
            Utils.info("抓取最小图片大小为" + imgMinSize + "KB");
        }
    }

    if (autoSaveMemoryData) {
        // On start-up the start time counts as the time of the last automatic save.
        lastAutoSaveMemoryDataTime = System.currentTimeMillis();
    }

    if (!readTimeOutImgs.isEmpty()) {
        Utils.info("重新下载上次超时的图片");
        // Work on copies: the set is cleared here and refilled by downloads that time out again.
        List<PictInfo> retryList = readTimeOutImgs.stream()
                .map(p -> new PictInfo(p.getSite(), p.getTitle(), p.getUrl(), p.getHtmlUrl()))
                .collect(Collectors.toList());
        readTimeOutImgs.clear();
        DownLoad.downloadPict(retryList);
    }
}
/**
 * Moves one deferred link into the page queue when the queue has run empty.
 *
 * <p>The first deferred link is taken, unless it is a site root and another link is
 * available; in that case the second one is taken.</p>
 *
 * @param pageLinks queue of pages to process
 * @param lazyPageLinks queue of deferred pages
 */
private static void getLazyUrl(List<String> pageLinks, List<String> lazyPageLinks) {
    if (!pageLinks.isEmpty() || lazyPageLinks.isEmpty()) {
        return;
    }
    boolean skipFirst = Utils.isSite(lazyPageLinks.get(0)) && lazyPageLinks.size() > 1;
    int position = skipFirst ? 1 : 0;
    String chosen = lazyPageLinks.get(position);
    pageLinks.add(chosen);
    lazyPageLinks.remove(position);
}
/**
 * Re-reads the "stop" setting from the command file and ends the program when a stop was
 * requested or no page is left. Before exiting, the in-memory data and the command file
 * are written.
 *
 * @param pageLinks queue of pages to process
 * @param lazyPageLinks queue of deferred pages
 */
private static void checkAndExecutCommand(List<String> pageLinks, List<String> lazyPageLinks) {
    readCommandToMemory("stop");
    boolean nothingLeft = pageLinks.isEmpty();
    if (!stop && !nothingLeft) {
        return;
    }
    if (nothingLeft) {
        Utils.info("无法找到新的url抓取图片结束");
    }
    writeMemoryDataToFile(pageLinks, lazyPageLinks);
    writeDefaultCommand();
    System.exit(0);
}
/**
 * Saves the in-memory data and rewrites the command file when automatic saving is enabled
 * and either no save has happened yet or more than {@code autoSaveMemoryDataInterval}
 * minutes have passed since the last one.
 *
 * @param pageLinks queue of pages to process
 * @param lazyPageLinks queue of deferred pages
 */
private static void checkAutoSaveMemoryData(List<String> pageLinks, List<String> lazyPageLinks) {
    if (!autoSaveMemoryData) {
        return;
    }
    // Long arithmetic: an int product overflows for intervals above roughly 35791 minutes.
    long intervalMillis = autoSaveMemoryDataInterval * 60_000L;
    boolean due = null == lastAutoSaveMemoryDataTime
            || System.currentTimeMillis() > lastAutoSaveMemoryDataTime + intervalMillis;
    if (due) {
        writeMemoryDataToFile(pageLinks, lazyPageLinks);
        writeDefaultCommand();
        lastAutoSaveMemoryDataTime = System.currentTimeMillis();
    }
}
/**
 * Rewrites the command file with the current value of every setting.
 *
 * <p>The file is read first, so values edited by the user are taken over into memory.
 * The stop flag is always written back as {@code false}.</p>
 */
private static void writeDefaultCommand() {
    readCommandToMemory();
    // Empty the file, then write one "key=value" line per setting.
    Utils.writeStringToFile(commandFile, "", false);
    String[] lines = {
        "stop=false",
        "imgMinSize=" + imgMinSize,
        "nextPageNames=" + StringUtils.join(nextPageNames, ","),
        "pageTitlefilters=" + StringUtils.join(pageTitlefilters, ","),
        "imgNamefilters=" + StringUtils.join(imgNamefilters, ","),
        "imgSrcRepletTags=" + StringUtils.join(imgSrcRepletTags, ","),
        "allowDifferentCharacters=" + allowDifferentCharacters,
        "browserTimeout=" + browserTimeout,
        "jsTimeout=" + jsTimeout,
        "jsEnabled=" + jsEnabled,
        "waitForBackgroundJavaScript=" + waitForBackgroundJavaScript,
        "refererUrl=" + refererUrl,
        "transboundarySites=" + StringUtils.join(transboundarySites, ","),
        "fixedUrlPrefix=" + fixedUrlPrefix,
        "thread=" + thread,
        "threadSize=" + threadSize,
        "threadSleep=" + threadSleep,
        "autoSaveMemoryData=" + autoSaveMemoryData,
        "autoSaveMemoryDataInterval=" + autoSaveMemoryDataInterval
    };
    for (String line : lines) {
        Utils.writeStringToFile(commandFile, line);
    }
}
/** Loads every setting found in the command file into memory. */
private static void readCommandToMemory() {
    readCommandToMemory(Collections.<String>emptyList());
}

/**
 * Loads a single setting from the command file into memory.
 *
 * @param key name of the setting to load
 */
private static void readCommandToMemory(String key) {
    readCommandToMemory(Collections.singletonList(key));
}

/**
 * Loads settings from the command file into memory.
 *
 * <p>Each line has the form {@code key=value}; the line is split at the first "=" so that
 * values may themselves contain "=" (for example a URL with a query string). Lines without
 * a key or without a value, and unknown keys, are ignored. A value that cannot be parsed is
 * logged and leaves the setting unchanged.</p>
 *
 * @param keys names of the settings to load; {@code null} or empty loads all of them
 */
private static void readCommandToMemory(List<String> keys) {
    Set<String> command = new HashSet<String>();
    Utils.readFileDataToCollection(commandFile, command);
    boolean loadAll = null == keys || keys.isEmpty();
    for (String line : command) {
        int separator = line.indexOf('=');
        if (separator <= 0 || separator == line.length() - 1) {
            continue;
        }
        String name = line.substring(0, separator);
        String value = line.substring(separator + 1);
        if (!loadAll && !keys.contains(name)) {
            continue;
        }
        try {
            switch (name) {
                case "stop":
                    stop = Boolean.parseBoolean(value);
                    break;
                case "imgMinSize":
                    imgMinSize = Long.parseLong(value);
                    break;
                case "nextPageNames":
                    nextPageNames = splitCommaSeparated(value);
                    break;
                case "pageTitlefilters":
                    pageTitlefilters = splitCommaSeparated(value);
                    break;
                case "imgNamefilters":
                    imgNamefilters = splitCommaSeparated(value);
                    break;
                case "imgSrcRepletTags":
                    imgSrcRepletTags = splitCommaSeparated(value);
                    break;
                case "allowDifferentCharacters":
                    allowDifferentCharacters = value;
                    break;
                case "browserTimeout":
                    browserTimeout = Integer.parseInt(value);
                    break;
                case "jsTimeout":
                    jsTimeout = Integer.parseInt(value);
                    break;
                case "jsEnabled":
                    jsEnabled = Boolean.parseBoolean(value);
                    break;
                case "waitForBackgroundJavaScript":
                    waitForBackgroundJavaScript = Integer.parseInt(value);
                    break;
                case "refererUrl":
                    refererUrl = value;
                    break;
                case "transboundarySites":
                    transboundarySites = Arrays.stream(value.split(",")).collect(Collectors.toSet());
                    break;
                case "fixedUrlPrefix":
                    fixedUrlPrefix = value;
                    break;
                case "thread":
                    thread = Boolean.parseBoolean(value);
                    break;
                case "threadSize":
                    threadSize = Integer.parseInt(value);
                    break;
                case "threadSleep":
                    threadSleep = Integer.parseInt(value);
                    break;
                case "autoSaveMemoryData":
                    autoSaveMemoryData = Boolean.parseBoolean(value);
                    break;
                case "autoSaveMemoryDataInterval":
                    autoSaveMemoryDataInterval = Integer.parseInt(value);
                    break;
                default:
                    // Unknown keys are ignored.
                    break;
            }
        } catch (NumberFormatException e) {
            Utils.error("参数错误" + e.getMessage());
        }
    }
}

/** Splits a comma-separated value into a mutable list. */
private static List<String> splitCommaSeparated(String value) {
    return Arrays.stream(value.split(",")).collect(Collectors.toList());
}
/**
 * Writes the in-memory crawl state to its record files: site roots, crawled addresses,
 * the page queue, the deferred queue, downloaded picture addresses, downloaded directory
 * names and the pictures that timed out.
 *
 * @param pageLinks queue of pages to process
 * @param lazyPageLinks queue of deferred pages
 */
private static void writeMemoryDataToFile(List<String> pageLinks, List<String> lazyPageLinks) {
Utils.info("正在保存重要的分析数据,请不要强制退出");
// Each record is first overwritten with an empty entry, then the current data is written to it.
Utils.writeStringToFile(siteData, "", false);
Utils.writeCollectionToFile(siteData, sites);
Utils.writeStringToFile(allUrlData, "", false);
Utils.writeCollectionToFile(allUrlData, pageSet);
Utils.writeStringToFile(currentUrlData, "", false);
Utils.writeCollectionToFile(currentUrlData, pageLinks);
Utils.writeStringToFile(lazyUrlData, "", false);
Utils.writeCollectionToFile(lazyUrlData, lazyPageLinks);
Utils.writeStringToFile(downLoadedImgData, "", false);
Utils.writeCollectionToFile(downLoadedImgData, imgDownloaded);
Utils.writeStringToFile(downloadedImgDirNameData, "", false);
Utils.writeCollectionToFile(downloadedImgDirNameData, imgDownloadedDirName);
Utils.writeStringToFile(readTimeOutImgData, "", false);
// Timed-out pictures are PictInfo records and go through their own writer.
Utils.writePictInfoCollectionToFile(readTimeOutImgData, readTimeOutImgs);
}
}

View File

@ -0,0 +1,47 @@
package com.hitoli.fetchPic;
/**
 * Describes one picture found by the crawler: the site it belongs to, the title of the page
 * it was found on, its own address and the address of that page.
 *
 * <p>No property is ever {@code null}: the constructor and the setters both store the empty
 * string in place of {@code null}.</p>
 */
public class PictInfo {
    private String site;
    private String url;
    private String title;
    private String htmlUrl;

    /**
     * @param site the site the picture belongs to
     * @param title title of the page the picture was found on
     * @param url address of the picture
     * @param htmlUrl address of the page the picture was found on
     */
    public PictInfo(String site, String title, String url, String htmlUrl) {
        this.site = emptyIfNull(site);
        this.title = emptyIfNull(title);
        this.url = emptyIfNull(url);
        this.htmlUrl = emptyIfNull(htmlUrl);
    }

    /** Returns the value, or the empty string for {@code null}. */
    private static String emptyIfNull(String value) {
        return null == value ? "" : value;
    }

    public String getSite() {
        return site;
    }

    public void setSite(String site) {
        this.site = emptyIfNull(site);
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = emptyIfNull(url);
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = emptyIfNull(title);
    }

    public String getHtmlUrl() {
        return htmlUrl;
    }

    public void setHtmlUrl(String htmlUrl) {
        this.htmlUrl = emptyIfNull(htmlUrl);
    }
}

View File

@ -0,0 +1,622 @@
package com.hitoli.fetchPic;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.logging.FileHandler;
import java.util.logging.Formatter;
import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
public class Utils {
private static Logger loginfo = null;
private static Logger logerror = null;
/**
 * Log formatter that renders one record per line as
 * {@code <date-time> <LEVEL> : <message>} followed by a line feed.
 * The timestamp is the record time truncated to whole seconds and shown
 * at a fixed UTC+8 offset.
 */
static class MyLogHander extends Formatter {
    @Override
    public String format(LogRecord record) {
        long epochSeconds = record.getMillis() / 1000;
        LocalDateTime timestamp = LocalDateTime.ofEpochSecond(epochSeconds, 0, ZoneOffset.ofHours(8));
        StringBuilder line = new StringBuilder();
        line.append(timestamp);
        line.append(" ");
        line.append(record.getLevel());
        line.append(" : ");
        line.append(record.getMessage());
        line.append("\n");
        return line.toString();
    }
}
/**
 * Initialises the two loggers used by {@link #info(String)} and
 * {@link #error(String)} and attaches a file handler to each.
 *
 * <p>The log files are placed under {@code Main.downLoadDir}. Both handlers
 * are opened inside one try block, so if opening the info log throws, the
 * error log is not attempted either. A logger whose handler could not be
 * opened is still created; only a message is printed to standard output.
 */
private static void createLogger() {
    loginfo = Logger.getLogger("fetchPicLog-info");
    loginfo.setLevel(Level.ALL);
    logerror = Logger.getLogger("fetchPicLog-error");
    logerror.setLevel(Level.WARNING);

    FileHandler infoHandler = null;
    FileHandler errorHandler = null;
    try {
        String infoLogPath = Main.downLoadDir + File.separator + "fetchPicLog-info.log";
        infoHandler = new FileHandler(infoLogPath);
        String errorLogPath = Main.downLoadDir + File.separator + "fetchPicLog-error.log";
        errorHandler = new FileHandler(errorLogPath);
    } catch (IOException e) {
        e.printStackTrace();
    }

    if (infoHandler == null) {
        System.out.println("创建信息日志文件失败");
    } else {
        infoHandler.setLevel(Level.INFO);
        infoHandler.setFormatter(new MyLogHander());
        loginfo.addHandler(infoHandler);
    }

    if (errorHandler == null) {
        System.out.println("创建错误日志文件失败");
    } else {
        errorHandler.setLevel(Level.WARNING);
        errorHandler.setFormatter(new MyLogHander());
        logerror.addHandler(errorHandler);
    }
}
/**
 * Writes {@code msg} to the info logger at INFO level, creating the
 * loggers first if they have not been set up yet.
 *
 * @param msg text to log
 */
public static void info(String msg) {
    boolean loggerMissing = (loginfo == null);
    if (loggerMissing) {
        createLogger();
    }
    loginfo.info(msg);
}
/**
 * Writes {@code msg} to the error logger at WARNING level, creating the
 * loggers first if they have not been set up yet.
 *
 * @param msg text to log
 */
public static void error(String msg) {
    boolean loggerMissing = (logerror == null);
    if (loggerMissing) {
        createLogger();
    }
    logerror.warning(msg);
}
/**
 * Creates the directory {@code dir} if nothing exists at that path yet.
 *
 * <p>Missing parent directories are created as well. Creation is
 * best-effort: the method returns normally even when the directory could
 * not be created.
 *
 * @param dir path of the directory to create
 */
public static void createDir(String dir) {
    File target = new File(dir);
    if (!target.exists()) {
        // mkdirs() instead of mkdir(): mkdir() fails silently when a parent is missing.
        target.mkdirs();
    }
}
/**
 * Creates a new picture directory and records it in
 * {@code Main.imgDownloadedDirName}.
 *
 * <p>Nothing happens when the path already exists. Otherwise the directory
 * is created with {@code mkdir()} and an entry of the form
 * {@code <directory name>#####<directory path>} is added to the registry.
 * The result of {@code mkdir()} is not checked, so the entry is added even
 * if creation failed.
 *
 * @param dir path of the picture directory to create
 */
public static void createNewImgDir(String dir) {
    File imgDir = new File(dir);
    if (imgDir.exists()) {
        return;
    }
    imgDir.mkdir();
    String registryEntry = imgDir.getName() + "#####" + imgDir.getPath();
    Main.imgDownloadedDirName.add(registryEntry);
}
/**
 * Lists the entries directly inside {@code dir} that are not directories.
 *
 * <p>NOTE(review): the method name and its original description ("read the
 * directories under a directory") suggest it should return sub-directories,
 * but the filter keeps the entries for which {@code isDirectory()} is false.
 * The filter is left unchanged here because callers are not visible;
 * confirm the intended behaviour.
 *
 * @param dir path of the directory to list
 * @return the matching entries; an empty list when {@code dir} does not
 *         exist, is not a directory, or cannot be read
 */
public static List<File> getDirs(String dir) {
    // listFiles() returns null for a missing path, a regular file or an I/O error.
    File[] entries = new File(dir).listFiles();
    if (entries == null) {
        return Collections.emptyList();
    }
    return Arrays.stream(entries).filter(f -> !f.isDirectory()).collect(Collectors.toList());
}
/**
 * Deletes the directory {@code dir} together with everything inside it.
 *
 * <p>Sub-directories are removed recursively; without that the final
 * {@code delete()} fails whenever the directory contains another directory.
 * Symbolic links are removed as links and never followed, so content
 * outside {@code dir} is not touched. Nothing happens when {@code dir} is
 * not an existing directory. Deletion is best-effort: failures of
 * individual {@code delete()} calls are not reported.
 *
 * @param dir path of the directory to delete
 */
public static void delDir(String dir) {
    File root = new File(dir);
    if (!root.isDirectory()) {
        return;
    }
    // listFiles() may return null on an I/O error even for a directory.
    File[] children = root.listFiles();
    if (children != null) {
        for (File child : children) {
            boolean isLink = java.nio.file.Files.isSymbolicLink(child.toPath());
            if (!isLink && child.isDirectory()) {
                delDir(child.getPath());
            } else {
                child.delete();
            }
        }
    }
    root.delete();
}
/**
 * Creates an empty file at the given path unless something already exists
 * there. An {@link IOException} raised during creation is printed and
 * written to the error log; it is not propagated.
 *
 * @param fielPath path of the file to create
 */
public static void createFile(String fielPath) {
    File target = new File(fielPath);
    if (target.exists()) {
        return;
    }
    try {
        target.createNewFile();
    } catch (IOException e) {
        e.printStackTrace();
        Utils.error(e.getMessage());
    }
}
/**
 * Deletes the regular file at {@code filePath}. Paths that do not exist or
 * that do not denote a regular file (for example directories) are left alone.
 *
 * @param filePath path of the file to delete
 */
public static void delFile(String filePath) {
    File target = new File(filePath);
    boolean isExistingFile = target.exists() && target.isFile();
    if (!isExistingFile) {
        return;
    }
    target.delete();
}
/**
 * Tells whether a file or directory exists at the given path.
 *
 * @param filePath path to probe
 * @return {@code true} if something exists at {@code filePath}
 */
public static boolean checkFilePathExists(String filePath) {
    File candidate = new File(filePath);
    return candidate.exists();
}
/**
 * Reads a text file line by line and adds every non-empty line to
 * {@code collection}.
 *
 * <p>The file is decoded with the platform default charset. I/O errors
 * (including a missing file) are printed and written to the error log; they
 * are not propagated, and lines read before the error stay in the collection.
 *
 * @param filePath   file to read
 * @param collection receives one element per non-empty line
 */
public static void readFileDataToCollection(String filePath, Collection<String> collection) {
    InputStreamReader streamReader = null;
    BufferedReader lineReader = null;
    try {
        streamReader = new InputStreamReader(new FileInputStream(new File(filePath)));
        lineReader = new BufferedReader(streamReader);
        for (String line = lineReader.readLine(); line != null; line = lineReader.readLine()) {
            if (!StringUtils.isNotEmpty(line)) {
                continue;
            }
            collection.add(line);
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException, which was handled the same way.
        e.printStackTrace();
        Utils.error(e.getMessage());
    } finally {
        closeIO(streamReader, lineReader, null, null);
    }
}
/**
 * Reads picture records from a text file, one record per line, and adds
 * them to {@code collection}.
 *
 * <p>Each non-empty line is split on {@code #####}. Lines with fewer than
 * three fields are skipped. The first three fields are passed to the
 * {@link PictInfo} constructor as site, title and url; a fourth field, when
 * present, is passed as htmlUrl, otherwise {@code null} is passed. I/O errors
 * are printed and written to the error log and are not propagated.
 *
 * @param filePath   file to read
 * @param collection receives one {@link PictInfo} per accepted line
 */
public static void readFilePictInfoDataToCollection(String filePath, Collection<PictInfo> collection) {
    InputStreamReader streamReader = null;
    BufferedReader lineReader = null;
    try {
        streamReader = new InputStreamReader(new FileInputStream(new File(filePath)));
        lineReader = new BufferedReader(streamReader);
        for (String line = lineReader.readLine(); line != null; line = lineReader.readLine()) {
            if (!StringUtils.isNotEmpty(line)) {
                continue;
            }
            String[] fields = line.split("#####");
            if (fields.length < 3) {
                continue;
            }
            String htmlUrl = fields.length > 3 ? fields[3] : null;
            collection.add(new PictInfo(fields[0], fields[1], fields[2], htmlUrl));
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException, which was handled the same way.
        e.printStackTrace();
        Utils.error(e.getMessage());
    } finally {
        closeIO(streamReader, lineReader, null, null);
    }
}
/**
 * Appends {@code str} as one line to the file at {@code filePath}.
 * Shorthand for {@link #writeStringToFile(String, String, boolean)} with
 * {@code append} set to {@code true}.
 *
 * @param filePath file to write to
 * @param str      text of the line to append
 */
public static void writeStringToFile(String filePath, String str) {
    final boolean appendToExisting = true;
    writeStringToFile(filePath, str, appendToExisting);
}
/**
 * Appends every element of {@code collection} to the file at
 * {@code filePath}, one element per line terminated by {@code \r\n}.
 *
 * <p>A {@code null} or empty collection leaves the file untouched. I/O
 * errors are printed and written to the error log and are not propagated.
 *
 * @param filePath   file to append to
 * @param collection lines to write
 */
public static void writeCollectionToFile(String filePath, Collection<String> collection) {
    if (collection == null || collection.isEmpty()) {
        return;
    }
    FileWriter fileWriter = null;
    BufferedWriter buffered = null;
    try {
        fileWriter = new FileWriter(new File(filePath), true);
        buffered = new BufferedWriter(fileWriter);
        for (String entry : collection) {
            String line = entry + "\r\n";
            buffered.write(line);
        }
        buffered.flush();
    } catch (IOException e) {
        // Also covers FileNotFoundException, which was handled the same way.
        e.printStackTrace();
        Utils.error(e.getMessage());
    } finally {
        closeIO(null, null, fileWriter, buffered);
    }
}
/**
 * Appends every picture record of {@code collection} to the file at
 * {@code filePath}, one record per line terminated by {@code \r\n}.
 *
 * <p>A line has the form {@code site#####title#####url}, followed by
 * {@code #####htmlUrl} when the record has a non-empty htmlUrl. Writing the
 * htmlUrl keeps it across a save/load cycle:
 * {@link #readFilePictInfoDataToCollection(String, Collection)} accepts the
 * optional fourth field, and lines without it keep the three-field form.
 *
 * <p>A {@code null} or empty collection leaves the file untouched. I/O
 * errors are printed and written to the error log and are not propagated.
 *
 * @param filePath   file to append to
 * @param collection records to write
 */
public static void writePictInfoCollectionToFile(String filePath, Collection<PictInfo> collection) {
    if (null != collection && !collection.isEmpty()) {
        FileWriter fw = null;
        BufferedWriter out = null;
        try {
            fw = new FileWriter(new File(filePath), true);
            out = new BufferedWriter(fw);
            for (PictInfo pictInfo : collection) {
                StringBuilder record = new StringBuilder();
                record.append(pictInfo.getSite()).append("#####");
                record.append(pictInfo.getTitle()).append("#####");
                record.append(pictInfo.getUrl());
                String htmlUrl = pictInfo.getHtmlUrl();
                // Only emit the fourth field when there is something to keep.
                if (htmlUrl != null && !htmlUrl.isEmpty()) {
                    record.append("#####").append(htmlUrl);
                }
                record.append("\r\n");
                out.write(record.toString());
            }
            out.flush();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            Utils.error(e.getMessage());
        } catch (IOException e) {
            e.printStackTrace();
            Utils.error(e.getMessage());
        } finally {
            closeIO(null, null, fw, out);
        }
    }
}
/**
 * Writes {@code str} as one line (terminated by {@code \r\n}) to the file at
 * {@code filePath}.
 *
 * <p>Special case: when {@code append} is {@code false} and {@code str} is
 * empty, no line terminator is added, so the call leaves the file empty.
 * I/O errors are printed and written to the error log and are not propagated.
 *
 * @param filePath file to write to
 * @param str      text of the line
 * @param append   {@code true} to append to the file, {@code false} to replace its content
 */
public static void writeStringToFile(String filePath, String str, boolean append) {
    FileWriter fileWriter = null;
    BufferedWriter buffered = null;
    try {
        fileWriter = new FileWriter(new File(filePath), append);
        buffered = new BufferedWriter(fileWriter);
        String payload = str;
        // Skip the terminator only for "empty text, overwrite mode".
        if (append || !StringUtils.isEmpty(str)) {
            payload = str + "\r\n";
        }
        buffered.write(payload);
        buffered.flush();
    } catch (IOException e) {
        // Also covers FileNotFoundException, which was handled the same way.
        e.printStackTrace();
        Utils.error(e.getMessage());
    } finally {
        closeIO(null, null, fileWriter, buffered);
    }
}
/**
 * Closes the given readers and writers; any argument may be {@code null}.
 *
 * <p>Each resource is closed in its own try block, so a failure while
 * closing one does not prevent the others from being closed. The buffered
 * wrappers are closed before the streams they wrap. Failures are printed
 * and written to the error log and are not propagated.
 *
 * @param in  underlying reader, or {@code null}
 * @param br  buffered reader, or {@code null}
 * @param fw  underlying writer, or {@code null}
 * @param out buffered writer, or {@code null}
 */
private static void closeIO(InputStreamReader in, BufferedReader br, FileWriter fw, BufferedWriter out) {
    java.io.Closeable[] resources = {br, in, out, fw};
    for (java.io.Closeable resource : resources) {
        if (resource == null) {
            continue;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
            Utils.error(e.getMessage());
        }
    }
}
/**
 * Extracts the site part of an address: the optional {@code http://} or
 * {@code https://} prefix plus everything up to (not including) the next
 * {@code /}.
 *
 * <p>Examples: {@code http://a.com/b/c} gives {@code http://a.com};
 * {@code a.com/x} gives {@code a.com}. The scheme is recognised only by its
 * full {@code http://} / {@code https://} prefix, so a host that merely
 * starts with the letters "http" (e.g. {@code httpbin.org}) is treated as a
 * host and very short inputs no longer cause an index error.
 *
 * @param context address to inspect
 * @return the site without any path
 */
public static String getSite(String context) {
    String scheme = "";
    if (context.startsWith("https://")) {
        scheme = "https://";
    } else if (context.startsWith("http://")) {
        scheme = "http://";
    }
    String remainder = context.substring(scheme.length());
    int firstSlash = remainder.indexOf('/');
    String host = firstSlash == -1 ? remainder : remainder.substring(0, firstSlash);
    return scheme + host;
}
/**
 * Turns a scraped link into a full address.
 *
 * <ul>
 * <li>{@code //host/...}: prefixed with {@code https:} when {@code site}
 *     starts with "https", otherwise with {@code http:}.</li>
 * <li>{@code /...}: appended to {@code site}.</li>
 * <li>{@code ../...}, {@code ./...} or a link without any {@code /}:
 *     appended to the directory part of {@code path} (everything up to and
 *     including its last {@code /}). The {@code ..} and {@code .} segments
 *     are kept as written, not collapsed.</li>
 * <li>Anything else is returned unchanged.</li>
 * </ul>
 *
 * <p>The {@code ../} case uses the directory of {@code path}, because
 * appending to the full page address produces {@code page.html../x}.
 *
 * @param site site address without any path
 * @param path address of the page the link was found on
 * @param url  the scraped link
 * @return the resolved address
 */
public static String getFullPath(String site, String path, String url) {
    if (url.startsWith("//")) {
        String scheme = site.startsWith("https") ? "https:" : "http:";
        return scheme + url;
    }
    if (url.startsWith("/")) {
        return site + url;
    }
    if (url.startsWith("../") || url.startsWith("./") || url.indexOf("/") == -1) {
        return directoryOf(path) + url;
    }
    return url;
}

/** Returns {@code path} cut after its last {@code /}; a bare {@code scheme://host} gets a {@code /} appended. */
private static String directoryOf(String path) {
    int schemeEnd = path.indexOf("://");
    int lastSlash = path.lastIndexOf("/");
    if (schemeEnd != -1 && lastSlash < schemeEnd + 3) {
        // The only slashes belong to "://", so there is no path component yet.
        return path + "/";
    }
    return path.substring(0, lastSlash + 1);
}
/**
 * Checks whether {@code url} stays inside the configured sites.
 *
 * <p>The url is accepted when it contains the short domain (see
 * {@link #getShortSite(String)}) of any entry in
 * {@code Main.transboundarySites}, which is checked first, or of any entry
 * in {@code Main.sites}. A rejected url is written to the info log.
 *
 * @param url address to check
 * @return {@code true} when the url matches one of the configured sites
 */
public static Boolean checkSite(String url) {
    boolean withinBounds = false;
    for (String allowed : Main.transboundarySites) {
        if (url.contains(getShortSite(allowed))) {
            withinBounds = true;
            break;
        }
    }
    if (!withinBounds) {
        for (String own : Main.sites) {
            if (url.contains(getShortSite(own))) {
                withinBounds = true;
                break;
            }
        }
    }
    if (!withinBounds) {
        info("扫描到的url=" + url + "越界");
    }
    return withinBounds;
}
/**
 * Shortens a site address: an address starting with "http" loses its first
 * 7 characters (8 when it starts with "https"), then everything up to and
 * including the first {@code .} is dropped.
 *
 * <p>So {@code http://www.a.com} gives {@code a.com}. The first label is
 * dropped unconditionally, which means {@code a.com} gives {@code com}; an
 * input without any dot is returned as is (after prefix removal).
 *
 * @param site site address, with or without scheme
 * @return the shortened address
 */
public static String getShortSite(String site) {
    String host = site;
    if (host.startsWith("http")) {
        int prefixLength = host.startsWith("https") ? 8 : 7;
        host = host.substring(prefixLength);
    }
    int firstDot = host.indexOf(".");
    return host.substring(firstDot + 1);
}
/**
 * Checks whether {@code url} is the address of one of the configured sites
 * themselves (as opposed to a page below a site).
 *
 * <p>Both {@code url} and each entry of {@code Main.sites} are normalised
 * the same way before the case-insensitive comparison: the scheme is removed
 * with {@link #removeHttpOrHttps(String)} and one trailing {@code /} is
 * dropped. A configured site written with a trailing slash therefore still
 * matches, and an input that is empty after scheme removal yields
 * {@code false} instead of an index error.
 *
 * @param url address to check
 * @return {@code true} when {@code url} equals one of the configured sites
 */
public static boolean isSite(String url) {
    String candidate = removeHttpOrHttps(url);
    if (candidate.endsWith("/")) {
        candidate = candidate.substring(0, candidate.length() - 1);
    }
    for (String site : Main.sites) {
        String known = removeHttpOrHttps(site);
        if (known.endsWith("/")) {
            known = known.substring(0, known.length() - 1);
        }
        if (candidate.equalsIgnoreCase(known)) {
            return true;
        }
    }
    return false;
}
/**
 * Removes a leading {@code http://} or {@code https://} from {@code url}.
 *
 * <p>Only the complete prefix is recognised, so an address that merely
 * starts with the letters "http" (for example {@code httpbin.org}) is
 * returned unchanged, and inputs shorter than the prefix no longer cause an
 * index error.
 *
 * @param url address, with or without scheme
 * @return the address without its scheme prefix
 */
public static String removeHttpOrHttps(String url) {
    if (url.startsWith("https://")) {
        return url.substring("https://".length());
    }
    if (url.startsWith("http://")) {
        return url.substring("http://".length());
    }
    return url;
}
/**
 * Returns the short form of a site address (see
 * {@link #getShortSite(String)}) with every {@code .} replaced by
 * {@code _}, so that it can be used as a file name.
 *
 * @param site site address, with or without scheme
 * @return the short address with dots turned into underscores
 */
public static String getUseFileNameShortSite(String site) {
    String shortSite = getShortSite(site);
    return shortSite.replace('.', '_');
}
/**
 * Strips the characters {@code \ / : * ? " < > |} from {@code str}.
 *
 * @param str text to clean
 * @return {@code str} without those characters
 */
public static String specialSymbolRemoval(String str) {
    final String forbiddenChars = "[\\\\/:\\\\*\\\\?\\\\\"<>\\\\|]";
    String cleaned = str.replaceAll(forbiddenChars, "");
    return cleaned;
}
/**
 * Checks whether {@code text} is exactly one of the "next page" button
 * labels listed in {@code Main.nextPageNames}.
 *
 * @param text label to test
 * @return {@code true} when {@code text} equals one of the configured labels
 */
public static boolean isNextPageButton(String text) {
    for (String label : Main.nextPageNames) {
        if (text.equals(label)) {
            return true;
        }
    }
    return false;
}
/**
 * Builds the {@link File} a picture should be saved to, reusing an already
 * registered picture directory when its title is close enough to
 * {@code title}.
 *
 * <p>When {@code Main.imgDownloadedDirName} is null or empty the file is
 * placed via {@code _createImgFile}. Otherwise every registry entry of the
 * form {@code <title>#####<path>} is compared with {@code title} after both
 * titles have been stripped of brackets, whitespace, dots, underscores,
 * dashes and the characters 第 / 页:
 * <ul>
 * <li>titles equal ignoring case: the registered path is reused;</li>
 * <li>otherwise the number of differing characters is counted position by
 *     position, with every character by which one title is longer counted
 *     as a difference, and the path is reused when that count does not
 *     exceed the tolerance.</li>
 * </ul>
 * The tolerance comes from {@code Main.allowDifferentCharacters}. A value
 * strictly between 0 and 1 is treated as a ratio and converted, for each
 * candidate separately, to {@code maxLength - maxLength * ratio}; any other
 * value is used as an absolute character count.
 *
 * @param site   site directory name
 * @param title  picture title, used as directory name
 * @param name   file name without suffix
 * @param suffix file suffix
 * @return the target file (the file itself is not created here)
 * @throws IOException if no target file could be determined
 */
public static File createImgFile(String site, String title, String name, String suffix) throws IOException {
    File file = null;
    if (null == Main.imgDownloadedDirName || Main.imgDownloadedDirName.isEmpty()) {
        file = _createImgFile(site, title, name, suffix);
    } else {
        // Kept immutable: overwriting it inside the loop turned a ratio into the
        // absolute value computed for the first candidate.
        final double configuredTolerance = Double.parseDouble(Main.allowDifferentCharacters);
        // NOTE(review): "(\\\\d+\\页)" matches a literal backslash followed by 'd's, not
        // digits; "(\\d+页)" may have been intended. Left unchanged; confirm.
        final String noisePattern = "(\\[+)|(\\]+)|(\\s)|(\\.+)|(\\_+)|(\\-+)|(\\——+)|(\\-+)|(\\\\d+\\页)|(\\第)|(\\页)";
        String oldPath = null;
        String newTitle = title.replaceAll(noisePattern, "");
        char[] newChar = newTitle.toCharArray();
        for (String s : Main.imgDownloadedDirName) {
            String[] _s = s.split("#####");
            if (_s.length != 2) {
                continue;
            }
            String oldTitle = _s[0].replaceAll(noisePattern, "");
            if (oldTitle.equalsIgnoreCase(newTitle)) {
                oldPath = _s[1];
                break;
            }
            char[] oldChar = oldTitle.toCharArray();
            int maxLength = Math.max(oldChar.length, newChar.length);
            int minLength = Math.min(oldChar.length, newChar.length);
            double allowed = configuredTolerance;
            if (allowed < 1 && allowed > 0) {
                allowed = maxLength - (maxLength * allowed);
            }
            int lengthGap = maxLength - minLength;
            if (lengthGap != 0 && lengthGap > allowed) {
                // The length difference alone already exceeds the tolerance.
                continue;
            }
            // Surplus characters count as differences; compare only the common
            // prefix so the shorter array is never indexed out of bounds.
            int difference = lengthGap;
            for (int j = 0; j < minLength; j++) {
                if (oldChar[j] != newChar[j]) {
                    difference++;
                }
            }
            if (difference <= allowed) {
                oldPath = _s[1];
                break;
            }
        }
        if (StringUtils.isNotEmpty(oldPath)) {
            createDir(oldPath);
            file = new File(oldPath + File.separator + name + suffix);
        } else {
            file = _createImgFile(site, title, name, suffix);
        }
    }
    if (null == file) {
        throw new IOException("创建图片文件失败");
    }
    return file;
}
/**
 * Returns the file {@code <downLoadDir>/<site>/<title>/<name><suffix>}.
 * When the title directory does not exist yet, the site directory is
 * created via {@code createDir} and the title directory via
 * {@code createNewImgDir} first. The file itself is not created.
 */
private static File _createImgFile(String site, String title, String name, String suffix) {
    String siteDir = Main.downLoadDir + File.separator + site;
    String titleDir = siteDir + File.separator + title;
    boolean titleDirMissing = !checkFilePathExists(titleDir);
    if (titleDirMissing) {
        createDir(siteDir);
        createNewImgDir(titleDir);
    }
    String imgPath = titleDir + File.separator + name + suffix;
    return new File(imgPath);
}
}