-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwalker.go
More file actions
123 lines (107 loc) · 3.29 KB
/
walker.go
File metadata and controls
123 lines (107 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
package walker
import (
"fmt"
"io"
"net/http"
"net/url"
"sort"
"github.com/PuerkitoBio/goquery"
"github.com/foomo/walker/config"
"github.com/foomo/walker/htmlschema"
"github.com/foomo/walker/reports"
"github.com/foomo/walker/vo"
)
// start is the request value Walk sends on Walker.chanStart. It carries a
// copy of the configuration, the optional group validator and the callbacks
// supplied by the caller of Walk.
type start struct {
	// conf is a copy of the *config.Config passed to Walk.
	conf config.Config
	// groupValidator is left nil by Walk when conf.SchemaRoot is empty.
	groupValidator *htmlschema.GroupValidator
	// linkListFilterFunc is the linkListFilter argument of Walk.
	linkListFilterFunc LinkListFilterFunc
	// validationFunc is the validationFunc argument of Walk.
	validationFunc ValidationFunc
	// scrapeFunc is the scrapeFunc argument of Walk.
	scrapeFunc ScrapeFunc
	// scrapeResultModifierFunc is the scrapeResultModifierFunc argument of Walk.
	scrapeResultModifierFunc ScrapeResultModifierFunc
}
// started is the reply Walk receives on Walker.chanStarted after it has sent
// a start request. Walk hands both fields back to its caller unchanged.
type started struct {
	// Err becomes the error result of Walk.
	Err error
	// ChanLoopComplete becomes the channel result of Walk.
	// NOTE(review): presumably delivers the final status once the walk loop
	// has finished — confirm against the code that sends on chanStarted.
	ChanLoopComplete chan vo.Status
}
// sortLenStrings provides the Len/Less/Swap method set of sort.Interface for
// a string slice, ordering longer strings before shorter ones.
type sortLenStrings []string

// Len reports how many strings the slice holds.
func (s sortLenStrings) Len() int {
	return len(s)
}

// Less reports whether the string at index a is strictly longer than the one
// at index b, which produces a descending-by-length order.
func (s sortLenStrings) Less(a, b int) bool {
	return len(s[b]) < len(s[a])
}

// Swap exchanges the strings stored at indexes a and b.
func (s sortLenStrings) Swap(a, b int) {
	tmp := s[a]
	s[a] = s[b]
	s[b] = tmp
}
// sortPathsByLength returns a newly allocated copy of paths ordered from the
// longest string to the shortest. The input slice is not modified. Strings of
// equal length have no guaranteed relative order, since sort.Sort is not a
// stable sort.
func sortPathsByLength(paths []string) []string {
	byLen := make(sortLenStrings, len(paths))
	copy(byLen, paths)
	sort.Sort(byLen)
	return byLen
}
// LinkListFilterFunc receives the base URL, the URL of the current document
// and the parsed goquery document, and returns a vo.LinkList or an error.
// NOTE(review): presumably decides which links of a page are followed —
// confirm against the code that invokes it.
type LinkListFilterFunc func(baseURL, docURL *url.URL, doc *goquery.Document) (ll vo.LinkList, err error)

// ScrapeFunc receives an HTTP response and returns caller-defined scrape data
// of any type, or an error.
type ScrapeFunc func(response *http.Response) (scrapeData interface{}, err error)

// ScrapeResultModifierFunc receives a vo.ScrapeResult and returns a
// (possibly altered) vo.ScrapeResult, or an error.
type ScrapeResultModifierFunc func(result vo.ScrapeResult) (modifiedResult vo.ScrapeResult, err error)

// ValidationFunc receives a vo.Structure together with scrape data of any
// type and returns the resulting vo.Validations, or an error.
// NOTE(review): scrapeData is presumably the value produced by a ScrapeFunc —
// confirm against the caller.
type ValidationFunc func(structure vo.Structure, scrapeData interface{}) (vo.Validations, error)
// Walker is the handle used to start a walk (Walk), stop it (Stop) and query
// its state (GetStatus, GetReportHandler). Its methods talk to the goroutine
// launched by NewWalker through the channels below.
type Walker struct {
	// chanResult carries scrapeResultAndClient values.
	chanResult chan scrapeResultAndClient
	// chanStart takes the start request sent by Walk.
	chanStart chan start
	// chanStatus is used by GetStatus for both its request and the reply.
	chanStatus chan vo.Status
	// chanStop is used by Stop for both its request and the reply.
	chanStop chan vo.Status
	// chanStarted delivers the reply that Walk waits for after sending a
	// start request.
	chanStarted chan started
	// CompleteStatus is passed to the report handler on every request.
	// NOTE(review): presumably the status of the last completed walk —
	// confirm where it is assigned.
	CompleteStatus *vo.Status
}
// NewWalker allocates a Walker, creates all of its channels unbuffered and
// launches the scrapeloop method in its own goroutine before returning.
// CompleteStatus is left nil.
func NewWalker() *Walker {
	walker := new(Walker)
	walker.chanResult = make(chan scrapeResultAndClient)
	walker.chanStart = make(chan start)
	walker.chanStop = make(chan vo.Status)
	walker.chanStatus = make(chan vo.Status)
	walker.chanStarted = make(chan started)

	go walker.scrapeloop()

	return walker
}
// Walk requests the start of a new walk with the given configuration and
// callbacks.
//
// If conf.SchemaRoot is non-empty, a group validator is created from it
// first; an error from that step is returned immediately and no start request
// is sent. Otherwise a start request holding a copy of *conf is sent on
// chanStart, and Walk blocks until a reply arrives on chanStarted. The
// channel and error contained in that reply are returned.
//
// A nil conf is reported as an error instead of causing a nil-pointer panic.
func (w *Walker) Walk(
	conf *config.Config,
	linkListFilter LinkListFilterFunc,
	scrapeFunc ScrapeFunc,
	validationFunc ValidationFunc,
	scrapeResultModifierFunc ScrapeResultModifierFunc,
) (chanLoopStatus chan vo.Status, err error) {
	// conf is dereferenced below, so reject nil up front.
	if conf == nil {
		return nil, fmt.Errorf("walker: nil config")
	}

	var groupValidator *htmlschema.GroupValidator
	if conf.SchemaRoot != "" {
		gv, errGroupValidator := htmlschema.NewGroupValidator(conf.SchemaRoot)
		if errGroupValidator != nil {
			return nil, errGroupValidator
		}
		groupValidator = gv
	}

	w.chanStart <- start{
		groupValidator:           groupValidator,
		conf:                     *conf,
		scrapeFunc:               scrapeFunc,
		linkListFilterFunc:       linkListFilter,
		validationFunc:           validationFunc,
		scrapeResultModifierFunc: scrapeResultModifierFunc,
	}

	// Wait for the acknowledgement of the start request.
	st := <-w.chanStarted
	return st.ChanLoopComplete, st.Err
}
// Stop sends an empty vo.Status on chanStop as a stop request and returns the
// vo.Status it then receives on that same channel. With the unbuffered
// channel created by NewWalker, both the send and the receive block until
// the other side is ready.
func (w *Walker) Stop() vo.Status {
	var request vo.Status
	w.chanStop <- request
	reply := <-w.chanStop
	return reply
}
// GetStatus sends an empty vo.Status on chanStatus as a status request and
// returns the vo.Status it then receives on that same channel. With the
// unbuffered channel created by NewWalker, both the send and the receive
// block until the other side is ready.
func (w *Walker) GetStatus() vo.Status {
	var request vo.Status
	w.chanStatus <- request
	reply := <-w.chanStatus
	return reply
}
// line writes a horizontal rule made of dashes, followed by a newline, to w.
// Any write error is ignored.
func line(w io.Writer) {
	const rule = "------------------------------------------------------------------------"
	fmt.Fprintln(w, rule)
}
// headline writes an empty line to w, then the values v formatted the way
// fmt.Fprintln formats them (space-separated, newline-terminated), and finally
// a rule produced by line. Write errors are ignored.
func headline(w io.Writer, v ...interface{}) {
	fmt.Fprint(w, "\n")
	fmt.Fprintln(w, v...)
	line(w)
}
// GetReportHandler returns an http.HandlerFunc built on the handler obtained
// from reports.GetReportHandler(basePath). On every request it fetches the
// current status through GetStatus and calls that handler with the response
// writer, the request, CompleteStatus and a pointer to the freshly fetched
// status.
func (w *Walker) GetReportHandler(basePath string) http.HandlerFunc {
	reportHandler := reports.GetReportHandler(basePath)
	return func(rw http.ResponseWriter, req *http.Request) {
		// CompleteStatus is read per request, not captured once up front.
		current := w.GetStatus()
		reportHandler(rw, req, w.CompleteStatus, &current)
	}
}