Commit 327e9824 authored by Michael Koscher's avatar Michael Koscher
Browse files

Tests improved and local file support

parent fee718dc
......@@ -40,4 +40,33 @@
<scope>test</scope>
</dependency>
</dependencies>
<build>
  <plugins>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-assembly-plugin</artifactId>
      <executions>
        <execution>
          <phase>package</phase>
          <goals>
            <goal>single</goal>
          </goals>
          <configuration>
            <archive>
              <manifest>
                <!-- Fixed: keep mainClass on a single line; the surrounding
                     newlines/whitespace can leak into the manifest's
                     Main-Class entry and break `java -jar`. -->
                <mainClass>Main</mainClass>
              </manifest>
            </archive>
            <descriptorRefs>
              <!-- Bundle all dependencies into one runnable "fat" jar. -->
              <descriptorRef>jar-with-dependencies</descriptorRef>
            </descriptorRefs>
          </configuration>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>
</project>
\ No newline at end of file
import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.net.UnknownHostException;
import java.net.URL;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
// Web crawler built on jsoup: analyze() inspects a page (and, up to a given
// depth, the pages it links to) and builds a tree of Report objects.
public class Crawler {
// Class-wide logger used by all load/parse error paths.
private final static Logger LOGGER = Logger.getLogger(Crawler.class.getName());
public Crawler() {
// NOTE(review): raising the logger's level without configuring a handler may
// still suppress fine-grained records — confirm handler setup elsewhere.
LOGGER.setLevel(Level.ALL);
}
/**
 * Analyzes a page: counts words, images and videos, collects its links,
 * detects broken links and — while depth > 0 — recursively analyzes every
 * working link as a sub-report. Prints the finished report tree to stdout.
 *
 * Fixed merged-diff artifacts: each sub-link was loaded and analyzed twice
 * (once via the old loadDocumentFromWebpage path, once via getDocumentFromURL)
 * and the report was printed twice (printReport + printReports).
 *
 * @param webpage parsed document to analyze (must not be null)
 * @param depth   how many link levels to follow below this page (0 = none)
 * @return the populated report with its sub-reports attached
 */
public Report analyze(Document webpage, int depth) {
    Report report = new Report();
    report.setUrl(webpage.baseUri());
    report.setDeph(depth);
    report.setLinks(getAllLinks(webpage, depth));
    report.setImageCount(countImages(webpage));
    report.setVideoCount(countVideos(webpage));
    report.setWordCount(countWords(webpage));
    // Reuse the links already stored on the report instead of re-scanning the DOM.
    report.setBrokenLinks(findBrokenLinks(report.getLinks()));
    List<Report> subReports = new ArrayList<>();
    if (depth > 0) {
        for (String link : report.getLinks()) {
            if (!isBrokenLink(link)) {
                Document subWebpage = getDocumentFromURL(link);
                // getDocumentFromURL may return null on load failure; skip
                // such links instead of crashing the whole crawl.
                if (subWebpage != null) {
                    subReports.add(analyze(subWebpage, depth - 1));
                }
            }
        }
    }
    report.setSubReports(subReports);
    // Print the whole report tree to the console.
    printReports(report);
    return report;
}
private void printReport(Report report){
System.out.println("Report for URL: " + report.getUrl());
System.out.println("Depht is: " + report.getDeph());
System.out.println("-------------------------------------------- ");
System.out.println("Words: " + report.getWordCount());
System.out.println("Images: " + report.getImageCount());
System.out.println("Videos: " + report.getVideoCount());
System.out.println("Links: " + report.getLinks().size());
System.out.println("Broken Links: " + report.getBrokenLinks().size());
System.out.println("-------------------------------------------- ");
System.out.println("-------------------------------------------- ");
System.out.println("");
// Console-output entry point: prints the depth-0 header, then hands the
// whole report tree to the recursive two-argument overload.
private void printReports(Report report){
printReportLineForDepth("Report for depht: 0" , 0);
printReports(report, 0);
}
public Document loadDocumentFromWebpage(String url){
try {
return Jsoup.connect(url).get();
}catch (IOException e){
/**
 * Recursively prints a report and all of its sub-reports to stdout,
 * indenting every level by one additional tab.
 */
private void printReports(Report report, int depth) {
    // Lines forming this report's summary block, printed at this depth.
    String[] summary = {
        "URL: " + report.getUrl(),
        "-------------------------------------------- ",
        "Words: " + report.getWordCount(),
        "Images: " + report.getImageCount(),
        "Videos: " + report.getVideoCount(),
        "Links: " + report.getLinks().size(),
        "Broken Links: " + report.getBrokenLinks().size(),
        "--------------------------------------------",
        "############################################",
        "--------------------------------------------",
        ""
    };
    for (String line : summary) {
        printReportLineForDepth(line, depth);
    }
    if (!report.getSubReports().isEmpty()) {
        // Header for the next depth level, printed once before its reports.
        printReportLineForDepth("Report for depht: " + (depth + 1), depth + 1);
        printReportLineForDepth("-------------------------------------------- ", depth + 1);
        printReportLineForDepth("-------------------------------------------- ", depth + 1);
        printReportLineForDepth("", depth + 1);
    }
    for (Report child : report.getSubReports()) {
        printReports(child, depth + 1);
    }
}
// Prints one line indented with one tab per depth level.
// Fixed: removed a stray "return null;" (invalid in a void method, a merged
// diff artifact) that also made the println unreachable; builds the indent
// with StringBuilder instead of repeated String concatenation.
private void printReportLineForDepth(String line, int depth) {
    StringBuilder tabs = new StringBuilder();
    for (int i = 0; i < depth; i++) {
        tabs.append('\t');
    }
    System.out.println(tabs + line);
}
/**
 * Loads a document either from a local HTML file or from a remote URL.
 *
 * @param url path to a local .html/.htm file, or an absolute URL
 * @return the parsed document, or {@code null} when loading/parsing failed
 * @throws RuntimeException when {@code url} is null or empty
 */
public Document getDocumentFromURL(String url) throws RuntimeException {
    // Guard clause: reject missing input up front.
    if (url == null || url.isEmpty()) {
        throw new RuntimeException("URL is empty");
    }
    Document webpage = null;
    if (stringIsValidHtmlFile(url)) {
        // Local file takes precedence over a remote fetch.
        try {
            webpage = Jsoup.parse(new File(url), "UTF-8", "http://example.com/");
        } catch (Exception ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage());
        }
    } else if (stringIsValidURL(url)) {
        try {
            webpage = Jsoup.connect(url).get();
        } catch (IOException ioe) {
            LOGGER.log(Level.SEVERE, ioe.getMessage());
        }
    }
    return webpage;
}
private Integer countWords(Document webpage){
......@@ -74,42 +118,31 @@ public class Crawler {
}
/**
 * Collects the {@code href} attribute of every {@code <a>} element on the page.
 * Fixed: the merged diff declared the {@code links} list twice (compile error);
 * removed the large block of commented-out recursive-crawl code — recursion is
 * handled by {@code analyze} now.
 *
 * @param webpage parsed document to scan
 * @param depth   unused here; kept for interface compatibility with callers
 * @return all href values, in document order (may contain relative links)
 */
private List<String> getAllLinks(Document webpage, int depth) {
    List<String> links = new ArrayList<>();
    for (Element link : webpage.getElementsByTag("a")) {
        links.add(link.attr("href"));
    }
    return links;
}
private boolean checkString(String href){
if(href.startsWith("/")){ return true; }
if(href.startsWith("http://")){ return true; }
if(href.startsWith("https://")){ return true; }
if(href.startsWith("www")){ return true; }
// Checks whether the string parses as both a syntactically valid URL and URI.
// Any parse failure simply means "not a URL" — no exception escapes.
private boolean stringIsValidURL(String url) {
    try {
        URL candidate = new URL(url);
        candidate.toURI();
    } catch (Exception ex) {
        return false;
    }
    return true;
}
// True when the string names an existing regular file with an .html/.htm
// extension; false for directories, missing paths, or other extensions.
private boolean stringIsValidHtmlFile(String url) {
    boolean looksLikeHtml = url.endsWith(".html") || url.endsWith(".htm");
    if (!looksLikeHtml) {
        return false;
    }
    File candidate = new File(url);
    return candidate.exists() && candidate.isFile();
}
......@@ -125,16 +158,15 @@ public class Crawler {
private boolean isBrokenLink(String url){
try {
if(checkString(url)) {
if(stringIsValidURL(url)) {
Jsoup.connect(url).ignoreContentType(true).execute();
}else{
return true;
return false;
}else if(stringIsValidHtmlFile(url)) {
return false;
}
return false;
} catch (HttpStatusException | UnknownHostException e1) {
return true;
} catch (IOException e) {
e.printStackTrace();
} catch (IOException ioe) {
LOGGER.log(Level.INFO, ioe.getMessage());
return true;
}
}
......
import org.apache.commons.cli.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
// CLI entry point: parses command-line options (Apache commons-cli) and runs
// the crawler against the requested URL. NOTE(review): the middle of main()
// is elided by the diff hunk marker below — option setup is not visible here.
public class Main {
public static void main(String[] args){
Options options = new Options();
......@@ -29,10 +25,9 @@ public class Main {
depth = Integer.valueOf(cmd.getOptionValue("depth"));
}
//if(cmd.hasOption("url")){
// NOTE(review): leftover line from the previous revision (merged diff) —
// it unconditionally fetches a hard-coded site before the option check
// below and should be removed when the diff is reconciled.
webpage = crawler.loadDocumentFromWebpage("https://designyoursmile.at/");
if(cmd.hasOption("url")){
webpage = crawler.getDocumentFromURL(cmd.getOptionValue("url"));
crawler.analyze(webpage, depth);
//}
}
}
}
......@@ -8,6 +8,7 @@ public class Report {
private Integer wordCount;
private List<String> links;
private List<String> brokenLinks;
private List<Report> subReports;
public String getUrl() {
return url;
......@@ -17,14 +18,6 @@ public class Report {
this.url = url;
}
// Remaining crawl depth recorded for this report.
// NOTE(review): "Deph" is a typo for "Depth", kept because callers
// (Crawler.analyze, printReport) use these exact names.
public Integer getDeph() {
return deph;
}
public void setDeph(Integer deph) {
this.deph = deph;
}
// Number of <img> elements counted on the analyzed page.
public Integer getImageCount() {
return imageCount;
}
......@@ -64,4 +57,12 @@ public class Report {
// Stores the list of links that failed the broken-link probe.
public void setBrokenLinks(List<String> brokenLinks) {
this.brokenLinks = brokenLinks;
}
// Reports for pages reached by following this page's links.
// NOTE(review): returns null until setSubReports is called — Crawler.analyze
// always sets it, but other callers should confirm.
public List<Report> getSubReports() {
return subReports;
}
public void setSubReports(List<Report> subReports) {
this.subReports = subReports;
}
}
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.*;
import java.io.*;
import java.nio.file.Path;
import java.nio.file.Paths;
import static org.junit.Assert.assertEquals;
// Unit tests for Crawler; ordered so document loading is verified before the
// tests that depend on a loaded document.
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class CrawlerTest {
// NOTE(review): crawler1/crawler2/report1/report2 are leftovers from an
// older revision of these tests; the current tests use 'crawler' below.
public static Crawler crawler1 = new Crawler();
public static Crawler crawler2;
public static Report report1;
public static Report report2;
// Shared fixtures: parsed test page, crawler under test, fixture file path.
public static Document webpage;
public static Crawler crawler;
public static String testFilePath;
@BeforeAll
public static void init(){
Path resourceDirectory = Paths.get("src","test","resources","testPage.html");
File input = new File(resourceDirectory.toUri());
try {
webpage = Jsoup.parse(input, "UTF-8", "http://example.com/");
}catch (Exception ex){
testFilePath = resourceDirectory.toString();
}
crawler = new Crawler();
}
crawler1 = new Crawler();
crawler2 = new Crawler();
report1 = new Report();
report2 = new Report();
// Loading the local fixture file must yield a non-null document.
@Test
@Order(1)
public void testGetDocumentFromURL(){
Assert.assertNotNull(crawler.getDocumentFromURL(testFilePath));
}
@Test
public void testAnalyzeEmptyURL(){
Assert.assertThrows(IllegalArgumentException.class, () ->{
crawler1.analyze(null, 0);
@Order(2)
public void testGetDocumentFromURLOnEmptyString(){
Assert.assertThrows(RuntimeException.class, () ->{
crawler.getDocumentFromURL("");
});
}
// A null URL must be rejected with a RuntimeException.
@Test
@Order(3)
public void testGetDocumentFromURLOnNull(){
Assert.assertThrows(RuntimeException.class, () ->{
crawler.getDocumentFromURL(null);
});
}
// The fixture page contains exactly 305 words.
// Fixed: the merged diff kept both the old (crawler1/static webpage) and new
// assertion bodies, analyzing the page twice.
@Test
public void testWordCount() {
    Document webpage = crawler.getDocumentFromURL(testFilePath);
    Report report = crawler.analyze(webpage, 0);
    Assert.assertEquals(305, report.getWordCount().longValue());
}
// The fixture page contains exactly 2 images.
// Fixed: removed the duplicated old-revision assertion body (merged diff).
@Test
public void testImageCount() {
    Document webpage = crawler.getDocumentFromURL(testFilePath);
    Report report = crawler.analyze(webpage, 0);
    Assert.assertEquals(2, report.getImageCount().longValue());
}
// The fixture page contains exactly 1 video.
// Fixed: removed the duplicated old-revision assertion body (merged diff).
@Test
public void testVideoCount() {
    Document webpage = crawler.getDocumentFromURL(testFilePath);
    Report report = crawler.analyze(webpage, 0);
    Assert.assertEquals(1, report.getVideoCount().longValue());
}
// The fixture page contains exactly 3 links.
// Fixed: removed the duplicated old-revision assertion body (merged diff).
@Test
public void testLinkCount() {
    Document webpage = crawler.getDocumentFromURL(testFilePath);
    Report report = crawler.analyze(webpage, 0);
    Assert.assertEquals(3, report.getLinks().size());
}
// Exactly 1 of the fixture page's links is broken.
// Fixed: removed the duplicated old-revision assertion body (merged diff).
@Test
public void testBrokenLinkCount() {
    Document webpage = crawler.getDocumentFromURL(testFilePath);
    Report report = crawler.analyze(webpage, 0);
    Assert.assertEquals(1, report.getBrokenLinks().size());
}
// Releases the static fixtures after the whole class has run.
// NOTE(review): crawler1/crawler2/report1/report2 are legacy fields from the
// previous revision of these tests; nulling them is harmless.
@AfterAll
public static void tearDown(){
crawler1 = null;
crawler2 = null;
report1 = null;
report2 = null;
crawler = null;
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment