Commit 8603f8db authored by mirako

Analyze

parent 6b8653d5
@@ -2,30 +2,44 @@ import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.*;
public class Crawler {
public int countLinks;
public int countWords;
public int countImages;
public int countVideos;
// number of links collected via getLinks(Document)
public int getLinks;
Report report = new Report();
// state for the recursive crawl: visited URLs and the host to stay on
public static Set<String> uniqueURL = new HashSet<>();
public static String myPage;
public void analyzeWebpage(String url, Integer depth) {
Document webpage;
try {
// fetch the page that was actually requested instead of a hard-coded URL
webpage = Jsoup.connect(url).get();
} catch (IOException e) {
e.printStackTrace();
return;
}
// restrict the recursive crawl to the host of the starting page
// (assumes an absolute URL such as https://www.aau.at)
myPage = URI.create(url).getHost();
analyze(url, depth);
@@ -33,12 +47,13 @@ public class Crawler {
countLinks = webpage.getElementsByTag("a").size();
countWords = countWords(webpage);
countImages = countImages(webpage);
countVideos = countVideos(webpage);
getLinks = getLinks(webpage).size();
/*
report.setImageCount(countImages(webpage));
report.setVideoCount(countVideos);
report.setWordCount(countWords(webpage));
report.setLinks(getLinks(webpage));
*/
System.out.println("Count LInks: " + webpage.getElementsByTag("a").size());
@@ -50,6 +65,40 @@ public class Crawler {
}
private void analyze(String url, int depth) {
// stop recursing once the requested depth is used up
if (depth <= 0) {
return;
}
depth--;
try {
Document doc = Jsoup.connect(url).userAgent("Mozilla").get();
Elements links = doc.select("a");
if (links.isEmpty()) {
return;
}
int finalDepth = depth;
links.stream().map((link) -> link.attr("abs:href")).forEachOrdered((this_url) -> {
boolean add = uniqueURL.add(this_url);
if (add && this_url.contains(myPage)) {
try {
Document newWebpage = Jsoup.connect(this_url).get();
countImages(newWebpage);
countVideos(newWebpage);
countWords(newWebpage);
// collect the links once and check them for broken targets
findBrokenLinks(getLinks(newWebpage));
} catch (IOException e) {
e.printStackTrace();
}
analyze(this_url, finalDepth);
}
});
} catch (IOException ex) {
// fetch failed; skip this URL and keep crawling
ex.printStackTrace();
}
}
private Integer countWords(Document webpage){
String text = webpage.text();
// split on whitespace runs so consecutive spaces are not counted as words
return text.split("\\s+").length;
@@ -82,7 +131,6 @@ public class Crawler {
}
private boolean checkLink(String url){
try {
Connection.Response response = Jsoup.connect(url).ignoreContentType(true).execute();
return false;
@@ -93,8 +141,6 @@
} catch (IOException e) {
e.printStackTrace();
}
// reaching this point means the request failed, so report the link as broken
return true;
}
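// Note: execute() throws HttpStatusException on non-2xx responses and
// UnknownHostException on unresolvable hosts; the catch blocks elided by the
// diff presumably return true for those cases, marking the link as broken.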
}
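The Report class referenced by Crawler and Main is not part of this diff. Judging from the setter and getter calls above, a minimal sketch could look like the following; the field names and the List<String> type for links are assumptions, not part of this commit.
import java.util.List;
// Minimal sketch of the Report data holder implied by the calls in this diff.
public class Report {
private int imageCount;
private int videoCount;
private int wordCount;
private List<String> links;
public void setImageCount(int imageCount) { this.imageCount = imageCount; }
public void setVideoCount(int videoCount) { this.videoCount = videoCount; }
public void setWordCount(int wordCount) { this.wordCount = wordCount; }
public void setLinks(List<String> links) { this.links = links; }
public int getWordCount() { return wordCount; }
public List<String> getLinks() { return links; }
}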
@@ -13,6 +13,7 @@ public class Main {
Option depthArg = Option.builder().longOpt("depth").argName("d").hasArg().desc("Depth to parse").build();
Integer depth = 2;
Crawler crawler = new Crawler();
options.addOption(urlArg);
options.addOption(depthArg);
@@ -34,10 +35,10 @@
System.out.println("Count Links: " + crawler.countLinks);
System.out.println("Words: " + crawler.countWords);
System.out.println("Words: " + report.getWordCount());
System.out.println("Images: " + crawler.countImages);
System.out.println("Videos: " + crawler.countVideos);
System.out.println("Links: " + crawler.getLinks);
System.out.println("Links: " + crawler.setLinks);
}
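Assuming a --url option is defined alongside --depth in the lines elided from this hunk, and that the project is packaged as a runnable jar (name assumed), an invocation could look like this:
java -jar crawler.jar --url https://www.aau.at --depth 2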