Commit f75103ef authored by Michael Koscher's avatar Michael Koscher
Browse files

finish output

parent ec10efac
......@@ -7,6 +7,7 @@
<groupId>org.example</groupId>
<artifactId>cc-assignment-1</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<maven.compiler.source>11</maven.compiler.source>
......
......@@ -5,70 +5,52 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.*;
public class Crawler {

    // NOTE(review): these public String fields are never assigned or read in the
    // visible code — they appear to be leftovers from an earlier design (old
    // Main.java printed them). Kept for backward compatibility with any external
    // callers, but they are candidates for removal.
    public String countLinks;
    public String countImages;
    public String countVideos;
    public String setLinks;

    // Shared report instance; note that analyze() also builds its own local Report.
    Report report = new Report();
public Report analyze(Document webpage, int depth) {
Document subWebpage;
Report report = new Report();
report.setUrl(webpage.baseUri());
report.setDeph(depth);
report.setLinks(getAllLinks(webpage, depth));
report.setImageCount(countImages(webpage));
report.setVideoCount(countVideos(webpage));
report.setWordCount(countWords(webpage));
report.setBrokenLinks(findBrokenLinks(getAllLinks(webpage, depth)));
printReport(report);
if(depth > 0){
for (String link: report.getLinks()){
if(!isBrokenLink(link)){
subWebpage = loadDocumentFromWebpage(link);
analyze(subWebpage, depth - 1);
}
}
}
return report;
}
private void printReport(Report report){
System.out.println("Report for URL: " + report.getUrl());
System.out.println("Depht is: " + report.getDeph());
System.out.println("-------------------------------------------- ");
System.out.println("Words: " + report.getWordCount());
System.out.println("Images: " + report.getImageCount());
System.out.println("Videos: " + report.getVideoCount());
System.out.println("Links: " + report.getLinks().size());
System.out.println("Broken Links: " + report.getBrokenLinks().size());
System.out.println("-------------------------------------------- ");
System.out.println("-------------------------------------------- ");
System.out.println("");
}
public Document loadDocumentFromWebpage(String url){
try {
return Jsoup.connect(url).get();
......@@ -92,23 +74,45 @@ public class Crawler {
}
private List<String> getAllLinks(Document webpage, int depth) {
List<String> links = new ArrayList<>(); //Nur weils ein Link ist, braucht man keine Linked List ;)
if(depth >= 0) {
List<String> links = new ArrayList<String>();
// if(depth > 0) {
for (Element link : webpage.getElementsByTag("a")) {
String href = link.attr("href");
Document doc = null;
try {
doc = Jsoup.connect(href).get();
}catch(IOException e){
continue;
}
// if (checkString(href)) {
// try {
// doc = Jsoup.connect(href).get();
// } catch (IOException e) {
// continue;
// } catch (IllegalArgumentException iae){
// continue;
// }
//
// } else {
// File input = new File(href);
// try {
// doc = Jsoup.parse(input, "UTF-8", "http://example.com/");
// }catch (Exception ex){
//
// }
// }
links.add(href);
links.addAll(getAllLinks(doc, depth-1));
// if(doc != null) {
// links.addAll(getAllLinks(doc, depth - 1));
// }
}
}
// }
return links;
}
private boolean checkString(String href){
if(href.startsWith("/")){ return true; }
if(href.startsWith("http://")){ return true; }
if(href.startsWith("https://")){ return true; }
if(href.startsWith("www")){ return true; }
return false;
}
private List<String> findBrokenLinks(List<String> links){
List<String> brokenLinks = new ArrayList<>();
links.forEach(link -> {
......@@ -121,7 +125,11 @@ public class Crawler {
private boolean isBrokenLink(String url){
try {
Jsoup.connect(url).ignoreContentType(true).execute();
if(checkString(url)) {
Jsoup.connect(url).ignoreContentType(true).execute();
}else{
return true;
}
return false;
} catch (HttpStatusException | UnknownHostException e1) {
return true;
......
......@@ -11,9 +11,8 @@ public class Main {
CommandLine cmd = null;
Option urlArg = Option.builder().longOpt("url").argName("u").hasArg().desc("URL to parse").build();
Option depthArg = Option.builder().longOpt("depth").argName("d").hasArg().desc("Depth to parse").build();
Integer depth = 2;
Integer depth = 1;
Crawler crawler = new Crawler();
Report report = new Report();
Document webpage;
options.addOption(urlArg);
......@@ -30,18 +29,10 @@ public class Main {
depth = Integer.valueOf(cmd.getOptionValue("depth"));
}
if(cmd.hasOption("url")){
webpage = crawler.loadDocumentFromWebpage(cmd.getOptionValue("url"));
crawler.analyze(webpage, depth);
}
System.out.println("Count Links: " + crawler.countLinks);
System.out.println("Words: " + report.getWordCount());
System.out.println("Images: " + crawler.countImages);
System.out.println("Videos: " + crawler.countVideos);
System.out.println("Links: " + crawler.setLinks);
//if(cmd.hasOption("url")){
webpage = crawler.loadDocumentFromWebpage("https://designyoursmile.at/");
crawler.analyze(webpage, depth);
//}
}
}
import java.util.List;

/**
 * Value holder for the result of crawling a single page: its URL, the depth it
 * was analyzed at, image/video/word counts, and the (broken) links found on it.
 *
 * NOTE(review): "deph" is a typo for "depth"; renaming getDeph/setDeph would
 * break existing callers (Crawler, tests), so it is only flagged here.
 */
public class Report {
    private String url;
    // Crawl depth this report was produced at ("deph" typo, see class note).
    private Integer deph;
    private Integer imageCount;
    private Integer videoCount;
    private Integer wordCount;
    private List<String> links;
    private List<String> brokenLinks;

    /** @return the URL this report describes */
    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the crawl depth recorded for this report */
    public Integer getDeph() {
        return deph;
    }

    public void setDeph(Integer deph) {
        this.deph = deph;
    }

    /** @return number of images counted on the page */
    public Integer getImageCount() {
        return imageCount;
    }
......
......@@ -2,6 +2,7 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
......@@ -40,47 +41,47 @@ public class CrawlerTest {
@Test
public void testAnalyzeEmptyURL(){
Assert.assertThrows(IllegalArgumentException.class, () ->{
crawler1.analyze(null, 1);
crawler1.analyze(null, 0);
});
}
@Test
public void testWordCount(){
Report report;
report = crawler1.analyze(webpage, 1);
report = crawler1.analyze(webpage, 0);
Assert.assertEquals(305L,report.getWordCount().longValue());
}
@Test
public void testImageCount(){
Report report;
report = crawler1.analyze(webpage, 1);
report = crawler1.analyze(webpage, 0);
Assert.assertEquals(2L,report.getImageCount().longValue());
}
@Test
public void testVideoCount(){
Report report;
report = crawler1.analyze(webpage, 1);
report = crawler1.analyze(webpage, 0);
Assert.assertEquals(1L,report.getVideoCount().longValue());
}
@Test
public void testLinkCount(){
Report report;
report = crawler1.analyze(webpage, 1);
report = crawler1.analyze(webpage, 0);
Assert.assertEquals(3L,report.getLinks().size());
}
@Test
public void testBrokenLinkCount(){
Report report;
report = crawler1.analyze(webpage, 1);
report = crawler1.analyze(webpage, 0);
Assert.assertEquals(1L, report.getBrokenLinks().size());
}
@AfterEach
public void tearDown(){
@AfterAll
public static void tearDown(){
crawler1 = null;
crawler2 = null;
report1 = null;
......
......@@ -16,8 +16,8 @@
</p>
<img src="aauLogo.png">
<ul>
<li><a href="testPage.html">Test Link 1</a> </li>
<li><a href="testPage.html">Test Link 2</a> </li>
<li><a href="src/test/resources/testPage.html">Test Link 1</a> </li>
<li><a href="src/test/resources/testPage.html">Test Link 2</a> </li>
<li><a href="https://broken.link">Broken Link</a> </li>
</ul>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment