Commit 08e51bd8 authored by mirako
Browse files

Changing the Crawler

parent 6a548133
......@@ -2,6 +2,7 @@ import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
......@@ -16,17 +17,11 @@ public class Crawler {
Report report = new Report();
//for the recursive method
public static Set<String> uniqueURL = new HashSet<String>();
public static String myPage;
//
/*
public void analyzeWebpage(String url, Integer depth){
// Document webpage = Jsoup.connect(url).get();
// webpage.getElementsByTag("img").size();
Document webpage = null;
try {
webpage = Jsoup.connect("https://www.aau.at").get();
......@@ -53,7 +48,7 @@ public class Crawler {
report.setVideoCount(countVideos);
report.setWordCount(countWords(webpage));
report.setLinks(getLinks(webpage));
*/
System.out.println("Count LInks: " + webpage.getElementsByTag("a").size());
......@@ -62,42 +57,21 @@ public class Crawler {
System.out.println("Videos: " + countVideos(webpage));
System.out.println("Links: " + getLinks(webpage).size());
System.out.println("Broken Links: " + findBrokenLinks(getLinks(webpage)).size());
}
*/
/**
 * Fetches the page at {@code url} and stores its statistics (links, image
 * count, video count, word count and broken links) in {@code report}.
 *
 * <p>If the page cannot be downloaded the report is left untouched; the
 * failure is written to standard error instead of continuing with a
 * {@code null} document.
 *
 * @param url   address of the page to analyze
 * @param depth crawl depth; it is decremented once before being handed to
 *              {@code getAllLinks}
 */
public void analyze(String url, int depth) {
    // NOTE(review): the decrement is kept from the original method body; confirm
    // that "depth 1 == links of the start page only" is the intended contract.
    depth--;

    Document webpage;
    try {
        webpage = Jsoup.connect(url).get();
    } catch (IOException e) {
        // Without a document none of the counters below can be computed.
        System.err.println("Could not fetch " + url + ": " + e.getMessage());
        return;
    }

    // Collect the links once: every call crawls the network again.
    List<String> links = getAllLinks(webpage, depth);

    report.setLinks(links);
    report.setImageCount(countImages(webpage));
    report.setVideoCount(countVideos(webpage));
    report.setWordCount(countWords(webpage));
    report.setBrokenLinks(findBrokenLinks(links));
}
private Integer countWords(Document webpage){
......@@ -113,35 +87,43 @@ public class Crawler {
return webpage.getElementsByTag("video").size();
}
private List<String> getLinks(Document webpage){
List<String> links = new LinkedList<>();
webpage.getElementsByTag("a").forEach(link -> {
links.add(link.attr("href"));
});
private List<String> getAllLinks(Document webpage, int depth) {
List<String> links = new ArrayList<>(); //Nur weils ein Link ist, braucht man keine Linked List ;)
if(depth >= 0) {
for (Element link : webpage.getElementsByTag("a")) {
String href = link.attr("href");
Document doc = null;
try {
doc = Jsoup.connect(href).get();
}catch(IOException e){
continue;
}
links.add(href);
links.addAll(getAllLinks(doc, depth-1));
}
}
return links;
}
private List<String> findBrokenLinks(List<String> links){
List<String> brokenLinks = new ArrayList<>();
links.forEach(link -> {
if(checkLink(link)){
if(isBrokenLink(link)){
brokenLinks.add(link);
}
});
return brokenLinks;
}
private boolean checkLink(String url){
private boolean isBrokenLink(String url){
try {
Connection.Response response = Jsoup.connect(url).ignoreContentType(true).execute();
Jsoup.connect(url).ignoreContentType(true).execute();
return false;
} catch (HttpStatusException e1) {
return true;
} catch (UnknownHostException e2) {
} catch (HttpStatusException | UnknownHostException e1) {
return true;
} catch (IOException e) {
e.printStackTrace();
return true;
}
return false;
}
}
......@@ -36,7 +36,7 @@ public class CrawlerTest {
// Runs analyze() on both crawlers with the URL https://www.aau.at/.
// The method contains no assertions, so it only fails if analyze() throws.
@Test
public void testWordCount(){
// NOTE(review): the next two lines differ only in the depth argument (0 vs. 1);
// they look like the before/after variants of a single call — confirm which one is live.
crawler1.analyze("https://www.aau.at/", 0);
crawler1.analyze("https://www.aau.at/", 1);
crawler2.analyze("https://www.aau.at/", 1);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment