Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
mkoscher
CC Assignmen 1
Commits
8603f8db
Commit
8603f8db
authored
Apr 05, 2021
by
mirako
Browse files
Analyze
parent
6b8653d5
Changes
4
Hide whitespace changes
Inline
Side-by-side
src/main/java/Crawler.java
View file @
8603f8db
...
...
@@ -2,30 +2,44 @@ import org.jsoup.Connection;
import
org.jsoup.HttpStatusException
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.select.Elements
;
import
java.io.IOException
;
import
java.net.UnknownHostException
;
import
java.util.ArrayList
;
import
java.util.LinkedList
;
import
java.util.List
;
import
java.util.*
;
public
class
Crawler
{
public
int
countLinks
;
public
int
countWords
;
public
int
countImages
;
public
int
countVideos
;
public
int
getLinks
;
public
int
setLinks
;
Report
report
=
new
Report
();
//for the recursive method
public
static
Set
<
String
>
uniqueURL
=
new
HashSet
<
String
>();
public
static
String
myPage
;
//
public
void
analyzeWebpage
(
String
url
,
Integer
depth
){
// Document webpage = Jsoup.connect(url).get();
// webpage.getElementsByTag("img").size();
Document
webpage
=
null
;
try
{
webpage
=
Jsoup
.
connect
(
"https://www.aau.at"
).
get
();
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
//for the recursive method
Crawler
obj
=
new
Crawler
();
myPage
=
"aau.com"
;
obj
.
analyze
(
"http://stackoverflow.com/"
,
depth
);
// Integer countImg = webpage.getElementsByTag("img").size();
// System.out.println("Images Counted: "+countImg);
//
...
...
@@ -33,12 +47,13 @@ public class Crawler {
// System.out.println(link.attr("href"));
// });
//
countLinks
=
webpage
.
getElementsByTag
(
"a"
).
size
();
countWords
=
countWords
(
webpage
);
countImages
=
countImages
(
webpage
);
countVideos
=
countVideos
(
webpage
);
getLinks
=
getLinks
(
webpage
).
size
();
/**
report.setImageCount(countImages(webpage));
report.setVideoCount(countVideos);
report.setWordCount(countWords(webpage));
report.setLinks(getLinks(webpage));
*/
System
.
out
.
println
(
"Count LInks: "
+
webpage
.
getElementsByTag
(
"a"
).
size
());
...
...
@@ -50,6 +65,40 @@ public class Crawler {
}
/**
 * Recursively crawls the page at {@code url} by following the links in its anchor elements.
 *
 * <p>Each anchor's absolute {@code href} is recorded in {@code uniqueURL}. A link that was not
 * recorded before and whose URL contains {@code myPage} is fetched, handed to the counting
 * helpers and the broken-link check, and then crawled in turn with one level less of depth.
 *
 * @param url   address of the page whose links are followed
 * @param depth number of link levels still to follow; nothing is fetched once this is
 *              zero or negative
 */
private void analyze(String url, int depth) {
    // Previously depth was decremented but never tested, so the recursion was bounded
    // only by the set of already-seen URLs. Stop explicitly when the budget is used up.
    if (depth <= 0) {
        return;
    }
    final int remainingDepth = depth - 1;

    try {
        Document doc = Jsoup.connect(url).userAgent("Mozilla").get();
        Elements links = doc.select("a");
        if (links.isEmpty()) {
            return;
        }
        links.stream()
                .map(link -> link.attr("abs:href"))
                .forEachOrdered(linkUrl -> crawlLink(linkUrl, remainingDepth));
    } catch (IOException ex) {
        // Report the failure instead of swallowing it; a page that cannot be fetched
        // ends this branch of the crawl but must not abort the rest of it.
        System.err.println("Could not fetch " + url + ": " + ex);
    }
}

/**
 * Processes a single link found by {@link #analyze(String, int)}.
 *
 * @param linkUrl        absolute URL taken from an anchor's {@code abs:href} attribute
 * @param remainingDepth depth budget passed on to the recursive crawl of {@code linkUrl}
 */
private void crawlLink(String linkUrl, int remainingDepth) {
    // Set.add returns false for URLs that were already recorded, which prevents revisits.
    boolean firstVisit = uniqueURL.add(linkUrl);
    if (!firstVisit || !linkUrl.contains(myPage)) {
        return;
    }
    try {
        Document newWebpage = Jsoup.connect(linkUrl).get();
        // NOTE(review): the return values of these helpers are discarded here, as in the
        // original; confirm whether they are meant to be accumulated into the report.
        countImages(newWebpage);
        countVideos(newWebpage);
        countWords(newWebpage);
        getLinks(newWebpage);
        findBrokenLinks(getLinks(newWebpage));
    } catch (IOException e) {
        e.printStackTrace();
    }
    // As before, the crawl continues into the link even if fetching it above failed.
    analyze(linkUrl, remainingDepth);
}
private
Integer
countWords
(
Document
webpage
){
String
text
=
webpage
.
text
();
return
text
.
split
(
" "
).
length
;
...
...
@@ -82,7 +131,6 @@ public class Crawler {
}
private
boolean
checkLink
(
String
url
){
try
{
Connection
.
Response
response
=
Jsoup
.
connect
(
url
).
ignoreContentType
(
true
).
execute
();
return
false
;
...
...
@@ -93,8 +141,6 @@ public class Crawler {
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
return
true
;
return
false
;
}
}
src/main/java/Main.java
View file @
8603f8db
...
...
@@ -13,6 +13,7 @@ public class Main {
Option
depthArg
=
Option
.
builder
().
longOpt
(
"depth"
).
argName
(
"d"
).
hasArg
().
desc
(
"Depth to parse"
).
build
();
Integer
depth
=
2
;
Crawler
crawler
=
new
Crawler
();
Report
report
=
new
Report
();
options
.
addOption
(
urlArg
);
options
.
addOption
(
depthArg
);
...
...
@@ -34,10 +35,10 @@ public class Main {
System
.
out
.
println
(
"Count Links: "
+
crawler
.
countLinks
);
System
.
out
.
println
(
"Words: "
+
crawler
.
countWords
);
System
.
out
.
println
(
"Words: "
+
report
.
getWordCount
()
);
System
.
out
.
println
(
"Images: "
+
crawler
.
countImages
);
System
.
out
.
println
(
"Videos: "
+
crawler
.
countVideos
);
System
.
out
.
println
(
"Links: "
+
crawler
.
g
etLinks
);
System
.
out
.
println
(
"Links: "
+
crawler
.
s
etLinks
);
}
...
...
target/classes/Crawler.class
View file @
8603f8db
No preview for this file type
target/classes/Main.class
View file @
8603f8db
No preview for this file type
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment