-
Notifications
You must be signed in to change notification settings - Fork 0
/
AnalyseResource.h
69 lines (50 loc) · 1.9 KB
/
AnalyseResource.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#ifndef ANALYSERESOURCE_H
#define ANALYSERESOURCE_H
#include <set>
#include <vector>
#include <unordered_map>
#include <functional>
#include "DownloadResource.h"
#include "gumbo-parser-master/src/gumbo.h"
/// Plain aggregate holding everything an analyser produces for one resource.
/// It is the return type of ContentAnalyser callables and is passed by
/// mutable reference to HTMLAnalyser callables so they can add to it.
/// NOTE(review): the per-field meanings below are inferred from names and
/// types only — confirm against the implementation file.
struct AnalyseResults
{
/// Ordered, duplicate-free collection of URLs (std::set semantics);
/// presumably the outgoing links found in the resource.
std::set<URL> links;
/// Sequence of (string, URL) pairs; presumably a link text or label paired
/// with the URL it refers to — TODO confirm which URL (source or target).
std::vector<std::pair<std::string, URL>> backlinks;
/// Maps a string to an unsigned counter; presumably word -> number of
/// occurrences.
std::unordered_map<std::string, unsigned int> words;
/// Single string; presumably the accumulated textual content of the resource.
std::string full_text;
};
/// Small aggregate that carries a single ContentType value.
/// NOTE(review): the name suggests it is used to report a content type for
/// which no analyser is available (possibly thrown as an exception object);
/// it does not derive from std::exception — confirm how callers use it.
struct UnknownContentType
{
/// The content type being reported.
ContentType content_type;
};
/// Type-erased callable that takes a web resource by const reference and
/// returns a freshly built AnalyseResults by value.
using ContentAnalyser = std::function<AnalyseResults(WebRessource const&)>;
/// Callable object that owns a table of ContentAnalyser callables keyed by
/// ContentType.
/// NOTE(review): presumably operator() looks up the analyser matching the
/// resource's content type and forwards to it; the lookup and the behaviour
/// for an unregistered type are defined in the implementation file — confirm.
class Analyzer
{
public:
/// Creates an analyzer with an empty analyser table.
Analyzer() = default;
/// Analyses `webres` and returns the results by value.
/// Declared non-const, so it cannot be invoked on a const Analyzer.
AnalyseResults operator()(WebRessource const& webres);
/// Associates `analyser` with `content_type`.
/// NOTE(review): whether an existing entry is replaced or kept is decided
/// in the implementation — confirm.
void setAnalyser(ContentType const& content_type, ContentAnalyser analyser);
private:
/// Analyser table. ContentType must be hashable and equality-comparable
/// for this std::unordered_map to be instantiable.
std::unordered_map<ContentType, ContentAnalyser> m_analysers;
};
/// Type-erased callable invoked with the web resource (read-only), the
/// results object to update in place, and a pointer to a Gumbo parse-tree
/// node. It returns nothing; all output goes through the AnalyseResults
/// reference. The free functions search_for_text, search_for_links,
/// search_for_words and search_for_img declared in this header have exactly
/// this signature.
using HTMLAnalyser = std::function<void(WebRessource const&, AnalyseResults&, GumboNode*)>;
/// Callable object that owns an ordered list of HTMLAnalyser callables.
/// Its operator() has the same signature as ContentAnalyser, so an instance
/// can be stored wherever a ContentAnalyser is expected.
/// NOTE(review): presumably it parses the resource as HTML with Gumbo and
/// runs every registered analyser over the parse tree — confirm in the
/// implementation file.
class TextHTMLAnalyser
{
public:
/// Creates an analyser with no registered HTMLAnalyser callables.
TextHTMLAnalyser() = default;
/// Analyses `webres` and returns the results by value.
/// Declared non-const, so it cannot be invoked on a const TextHTMLAnalyser.
AnalyseResults operator()(WebRessource const& webres);
/// Registers an additional per-node analyser.
/// NOTE(review): presumably appended to m_analysers, making registration
/// order the invocation order — confirm.
void addAnalyser(HTMLAnalyser analyser);
private:
/// Registered per-node analysers.
std::vector<HTMLAnalyser> m_analysers;
/// Internal helper taking the same arguments as an HTMLAnalyser.
/// NOTE(review): presumably applies the registered analysers to `node` and
/// recurses into its children — confirm.
void analyse_node(WebRessource const& webres, AnalyseResults& results, GumboNode* node);
};
/// Per-node analyser with the HTMLAnalyser signature: receives the resource,
/// the results object to update, and a Gumbo node; returns nothing.
/// NOTE(review): presumably collects text from `node` into
/// `results.full_text` — confirm in the implementation file.
void search_for_text(WebRessource const& webres, AnalyseResults& results, GumboNode* node);
/// Per-node analyser with the HTMLAnalyser signature: receives the resource,
/// the results object to update, and a Gumbo node; returns nothing.
/// NOTE(review): presumably records hyperlinks found at `node` into
/// `results.links` and/or `results.backlinks` — confirm in the
/// implementation file.
void search_for_links(WebRessource const& webres, AnalyseResults& results, GumboNode* node);
/// Per-node analyser with the HTMLAnalyser signature: receives the resource,
/// the results object to update, and a Gumbo node; returns nothing.
/// NOTE(review): presumably counts the words found at `node` into
/// `results.words` — confirm in the implementation file.
void search_for_words(WebRessource const& webres, AnalyseResults& results, GumboNode* node);
/// Factory: builds and returns a predicate for the given Gumbo tag. The
/// returned callable takes the same three arguments as an HTMLAnalyser but
/// yields a bool instead of void.
/// NOTE(review): presumably the predicate reports whether `node` lies inside
/// an element of type `tag` — confirm in the implementation file.
std::function<bool(WebRessource const& webres, AnalyseResults& results, GumboNode* node)>
generate_is_inside_tag(GumboTag tag);
/// Factory: builds and returns a callable for the given Gumbo tag. The
/// returned callable has the same parameter list and void result as
/// HTMLAnalyser.
/// NOTE(review): the name suggests the callable is used to skip elements of
/// type `tag` during analysis; how the skip is signalled through a void
/// return is not visible here — confirm in the implementation file.
std::function<void(WebRessource const& webres, AnalyseResults& results, GumboNode* node)>
generate_skip_tag(GumboTag tag);
/// Per-node analyser with the HTMLAnalyser signature: receives the resource,
/// the results object to update, and a Gumbo node; returns nothing.
/// NOTE(review): presumably handles image elements found at `node`; which
/// AnalyseResults field it fills is not visible here — confirm in the
/// implementation file.
void search_for_img(WebRessource const& webres, AnalyseResults& results, GumboNode* node);
/// Whole-resource analyser whose signature matches ContentAnalyser: takes the
/// resource by const reference and returns an AnalyseResults by value.
/// NOTE(review): presumably intended for resources fetched over FTP; what it
/// extracts is defined in the implementation file — confirm.
AnalyseResults analyse_ftp_file(WebRessource const& webres);
#endif // ANALYSERESOURCE_H