Commit 0ef8139d authored by mirako

Assignment 2

parent 15c1c38f
Crawler.java
@@ -8,29 +8,22 @@ import java.io.File;
 import java.io.IOException;
 import java.net.URL;
 import java.util.*;
-import java.util.logging.Level;
-import java.util.logging.Logger;
 
 public class Crawler {
-    private final static Logger LOGGER = Logger.getLogger(Crawler.class.getName());
-
-    public Crawler() {
-        LOGGER.setLevel(Level.ALL);
-    }
-
-    public Report analyze(Document webpage, int depth) {
+    public Report analyze(Document webpage, int depth) throws InvalidUrlException, HtmlParseException {
         Document subWebpage;
         List<Report> subReports = new ArrayList<>();
         Report subReport;
         Report report = new Report();
         report.setUrl(webpage.baseUri());
-        report.setLinks(getAllLinks(webpage, depth));
+        report.setLinks(getAllLinks(webpage));
         report.setImageCount(countImages(webpage));
         report.setVideoCount(countVideos(webpage));
         report.setWordCount(countWords(webpage));
-        report.setBrokenLinks(findBrokenLinks(getAllLinks(webpage, depth)));
+        report.setBrokenLinks(findBrokenLinks(getAllLinks(webpage)));
         if(depth > 0){
             for (String link: report.getLinks()){
@@ -85,27 +78,32 @@ public class Crawler {
             System.out.println(tabs+line);
     }
 
-    public Document getDocumentFromURL(String url) throws RuntimeException{
+    public Document getDocumentFromURL(String url) throws RuntimeException, InvalidUrlException, HtmlParseException {
         Document webpage = null;
-        if(url != null && !url.isEmpty()){
-            if(stringIsValidHtmlFile(url)){
-                File input = new File(url);
-                try {
-                    webpage = Jsoup.parse(input, "UTF-8", "http://example.com/");
-                }catch (Exception ex){
-                    LOGGER.log(Level.SEVERE, ex.getMessage());
-                }
-            }else if(stringIsValidURL(url)){
-                try {
-                    webpage = Jsoup.connect(url).get();
-                }catch (IOException ioe){
-                    LOGGER.log(Level.SEVERE, ioe.getMessage());
-                }
-            }
-        }else{
-            throw new RuntimeException("URL is empty");
+        if (url != null && !url.isEmpty()) {
+            if (stringIsValidHtmlFile(url)) {
+                File input = new File(url);
+                try {
+                    webpage = Jsoup.parse(input, "UTF-8", "http://example.com/");
+                } catch (IOException ex) {
+                    throw new HtmlParseException(url);
+                }
+            } else if (stringIsValidURL(url)) {
+                try {
+                    webpage = Jsoup.connect(url).get();
+                } catch (IOException ioe) {
+                    throw new HtmlParseException(url);
+                }
+            } else {
+                throw new InvalidUrlException(url);
+            }
+        } else {
+            throw new InvalidUrlException(" ");
         }
         return webpage;
     }
 
     private Integer countWords(Document webpage){
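The validation helpers stringIsValidHtmlFile and stringIsValidURL are referenced above but fall outside the hunk. A minimal sketch of what they might look like, consistent with the java.io.File and java.net.URL imports already in the file; the method names come from the commit, the bodies are assumptions:

    // Hypothetical bodies; only the names appear in the diff.
    private boolean stringIsValidHtmlFile(String url) {
        // Treat the argument as a path to a local HTML file.
        File file = new File(url);
        return file.isFile() && url.toLowerCase().endsWith(".html");
    }

    private boolean stringIsValidURL(String url) {
        try {
            // toURI() rejects syntactically broken URLs such as "https:// wj".
            new URL(url).toURI();
            return true;
        } catch (Exception e) {
            return false;
        }
    }

Under this assumption, "https:// wj" fails both checks, reaches the inner else, and raises InvalidUrlException, while "https://aau.t" is syntactically valid but unresolvable, so Jsoup.connect(url).get() throws IOException and surfaces as HtmlParseException — exactly the split the two new tests assert.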
@@ -121,7 +119,7 @@ public class Crawler {
         return webpage.getElementsByTag("video").size();
     }
 
-    private List<String> getAllLinks(Document webpage, int depth) {
+    private List<String> getAllLinks(Document webpage) {
         List<String> links = new ArrayList<>();
         for (Element link : webpage.getElementsByTag("a")) {
             String href = link.attr("href");
......
HtmlParseException.java (new file)
+package assignment;
+
+public class HtmlParseException extends Exception {
+
+    public HtmlParseException(String url) {
+        super(url);
+    }
+}
InvalidUrlException.java (new file)
+package assignment;
+
+public class InvalidUrlException extends Exception {
+
+    public InvalidUrlException(String url) {
+        super(url);
+    }
+}
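Both exception types carry the offending URL as the exception message, which is what the caller in Main logs. A minimal usage sketch (the class name ExceptionDemo is hypothetical):

    import assignment.Crawler;
    import assignment.HtmlParseException;
    import assignment.InvalidUrlException;

    public class ExceptionDemo {
        public static void main(String[] args) {
            Crawler crawler = new Crawler();
            try {
                crawler.getDocumentFromURL("https:// wj");
            } catch (InvalidUrlException e) {
                // getMessage() returns the URL handed to the constructor.
                System.err.println("Invalid URL: " + e.getMessage());
            } catch (HtmlParseException e) {
                System.err.println("Could not parse: " + e.getMessage());
            }
        }
    }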
Main.java
 package assignment;
 
 import org.apache.commons.cli.*;
 import org.jsoup.nodes.Document;
+import java.util.logging.Level;
+import java.util.logging.Logger;
 
 public class Main {
+    private final static Logger LOGGER = Logger.getLogger(Main.class.getName());
+
     public static void main(String[] args){
         Options options = new Options();
         CommandLine cmd = null;
@@ -12,29 +15,41 @@ public class Main {
         Integer depth = 0;
         Crawler crawler = new Crawler();
         Report report;
-        Document webpage;
 
         options.addOption(urlArg);
         options.addOption(depthArg);
+        urlArg.setArgs(Option.UNLIMITED_VALUES);
 
         CommandLineParser parser = new DefaultParser();
         try {
             cmd = parser.parse(options, args);
+        } catch (MissingArgumentException e) {
+            LOGGER.log(Level.SEVERE, "At least one URL has to be given");
+            System.exit(1);
         } catch (ParseException e) {
-            e.printStackTrace();
+            LOGGER.log(Level.SEVERE, "Failure on parsing arguments");
             System.exit(1);
         }
+        assert cmd != null;
 
         if(cmd.hasOption("depth")){
             depth = Integer.valueOf(cmd.getOptionValue("depth"));
         }
 
         if(cmd.hasOption("url")){
-            webpage = crawler.getDocumentFromURL(cmd.getOptionValue("url"));
-            report = crawler.analyze(webpage, depth);
+            String[] urls = cmd.getOptionValues("url");
 
-            //Print Report to Console
-            crawler.printReports(report);
+            for(String url : urls){
+                try {
+                    report = crawler.analyze(crawler.getDocumentFromURL(url), depth);
+                    //Print Report to Console
+                    crawler.printReports(report);
+                }catch (HtmlParseException e){
+                    LOGGER.log(Level.WARNING, "Parsing `" + e.getMessage() + "` failed.");
+                }catch (InvalidUrlException e){
+                    LOGGER.log(Level.WARNING, "Invalid URL `" + e.getMessage() + "` ");
+                }
+            }
         }
     }
 }
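The urlArg and depthArg Option definitions sit in the elided part of the hunk. Given cmd.hasOption("depth"), cmd.getOptionValues("url"), and the MissingArgumentException handler, they presumably look roughly like this (a sketch, not the committed code):

    // Hypothetical reconstruction of the elided commons-cli option setup.
    // hasArg = true means "-url" given without a value raises
    // MissingArgumentException, which main() maps to the
    // "At least one URL has to be given" message.
    Option urlArg = new Option("url", true, "one or more URLs or local HTML files to crawl");
    Option depthArg = new Option("depth", true, "recursion depth for linked pages (default 0)");

With that setup the tool would be invoked along the lines of java assignment.Main -url https://example.com -url page.html -depth 2, the multi-value -url being enabled by the urlArg.setArgs(Option.UNLIMITED_VALUES) call added above.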
CrawlerTest.java
 import assignment.Crawler;
+import assignment.HtmlParseException;
+import assignment.InvalidUrlException;
 import assignment.Report;
 import org.jsoup.nodes.Document;
 import org.jsoup.parser.HtmlTreeBuilder;
 import org.junit.Assert;
 import org.junit.jupiter.api.*;
@@ -24,15 +27,14 @@ public class CrawlerTest {
     @Test
     @Order(1)
-    public void testGetDocumentFromURL(){
+    public void testGetDocumentFromURL() throws HtmlParseException, InvalidUrlException {
         Assert.assertNotNull(crawler.getDocumentFromURL(testFilePath));
     }
 
     @Test
     @Order(2)
     public void testGetDocumentFromURLOnEmptyString(){
-        Assert.assertThrows(RuntimeException.class, () ->{
+        Assert.assertThrows(InvalidUrlException.class, () ->{
             crawler.getDocumentFromURL("");
         });
     }
@@ -40,46 +42,63 @@ public class CrawlerTest {
     @Test
     @Order(3)
     public void testGetDocumentFromURLOnNull(){
-        Assert.assertThrows(RuntimeException.class, () ->{
+        Assert.assertThrows(InvalidUrlException.class, () ->{
             crawler.getDocumentFromURL(null);
         });
     }
 
     @Test
-    public void testWordCount(){
+    public void testWordCount() throws HtmlParseException, InvalidUrlException {
         Document webpage = crawler.getDocumentFromURL(testFilePath);
         report = crawler.analyze(webpage, 0);
         Assert.assertEquals(305,report.getWordCount().longValue());
     }
 
     @Test
-    public void testImageCount(){
+    public void testImageCount() throws HtmlParseException, InvalidUrlException {
         Document webpage = crawler.getDocumentFromURL(testFilePath);
         report = crawler.analyze(webpage, 0);
         Assert.assertEquals(2,report.getImageCount().longValue());
     }
 
     @Test
-    public void testVideoCount(){
+    public void testVideoCount() throws HtmlParseException, InvalidUrlException {
         Document webpage = crawler.getDocumentFromURL(testFilePath);
         report = crawler.analyze(webpage, 0);
         Assert.assertEquals(1,report.getVideoCount().longValue());
     }
 
     @Test
-    public void testLinkCount(){
+    public void testLinkCount() throws HtmlParseException, InvalidUrlException {
         Document webpage = crawler.getDocumentFromURL(testFilePath);
         report = crawler.analyze(webpage, 0);
         Assert.assertEquals(3,report.getLinks().size());
     }
 
     @Test
-    public void testBrokenLinkCount(){
+    public void testBrokenLinkCount() throws HtmlParseException, InvalidUrlException {
         Document webpage = crawler.getDocumentFromURL(testFilePath);
         report = crawler.analyze(webpage, 0);
         Assert.assertEquals(1, report.getBrokenLinks().size());
     }
 
+    @Test
+    public void testInvalidUrlException(){
+        Assert.assertThrows(
+                InvalidUrlException.class, () -> {
+                    crawler.getDocumentFromURL("https:// wj");
+                });
+    }
+
+    @Test
+    public void testHtmlParseException(){
+        Assert.assertThrows(
+                HtmlParseException.class, () -> {
+                    crawler.getDocumentFromURL("https://aau.t");
+                }
+        );
+    }
+
     @AfterAll
     public static void tearDown(){
         crawler = null;
......
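A side note on the test class: it mixes JUnit 4's org.junit.Assert with JUnit 5 annotations from org.junit.jupiter.api, which works when both engines are on the classpath; the Jupiter-native form of the new assertions would read much the same (a sketch, assuming a plain JUnit 5 setup; the method name is hypothetical):

    import static org.junit.jupiter.api.Assertions.assertThrows;

    @Test
    public void testInvalidUrlExceptionJupiter() {
        // Jupiter's assertThrows mirrors org.junit.Assert.assertThrows.
        assertThrows(InvalidUrlException.class,
                () -> crawler.getDocumentFromURL("https:// wj"));
    }

Note that testHtmlParseException depends on the DNS lookup for "https://aau.t" failing with an IOException, so it exercises network behaviour rather than parsing alone.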