Skip to content

Commit

Permalink
Add Papiersaal
Browse files Browse the repository at this point in the history
Fixes #40
  • Loading branch information
jakopako committed Feb 1, 2022
1 parent 1e6acca commit bb88300
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 9 deletions.
42 changes: 41 additions & 1 deletion config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -576,4 +576,44 @@ crawlers:
regex_ignore: ".*abgesagt.*"
- field: "title"
regex_ignore: "geschlossene Gesellschaft"


- name: Papiersaal
type: concert
city: Zurich
url: "https://www.papiersaal.ch/"
event: ".single-act"
fields:
title:
loc: ".col-sm-8 h2 a strong"
url:
loc: ".col-sm-8 h2 a"
relative: true
on_subpage: ["comment"]
comment:
loc: ".col-lg-12 p"
node_index: 2
max_length: 200
date:
day_month_year:
loc: ".col-sm-8 h3"
layout: "Monday, 2. Jan 2006"
regex_extract:
exp: "[A-Za-z]+,\\s[0-9]{1,2}\\.\\s[\u00e4A-Za-z]{3,4}\\s[0-9]{4}"
time:
loc: ".col-sm-8 i"
layout: "15h04"
child_index: 3 # we only need this because multiple children match the regex.
regex_extract:
exp: "[0-9]{2}h[0-9]{2}"
location: "Europe/Berlin"
language: "de_DE"
paginator:
loc: ".pull-right"
relative: true
filters:
- field: "title"
regex_ignore: ".*Abgesagt.*"
- field: "title"
regex_ignore: ".*Verschoben.*"
- field: "title"
regex_ignore: ".*VERSCHOBEN.*"
3 changes: 3 additions & 0 deletions example-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ crawlers:
# Sometimes multiple nodes match the given selector. Then an index can be given to choose the right one.
# -1 means the last node. The default is 0
node_index: -1
# We only need this if multiple (text) children match the regex or if no regex is given and we need a specific child.
# Check out the Papiersaal crawler config for a real-life example. Currently, all keys under 'date' can define this field.
child_index: 3
location: "Europe/Berlin" # The time zone of this event location
language: "de_DE" # the language that the date & time string(s) are written in.
filters: # A list of filters to remove certain items from the event list.
Expand Down
22 changes: 14 additions & 8 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,21 +318,26 @@ func extractField(item string, s *goquery.Selection, crawler *Crawler, event *Ev
func getDateStringAndLayout(dl *DateField, s *goquery.Selection) (string, string) {
var fieldString, fieldLayout string
fieldStringSelection := s.Find(dl.Loc)
// TODO: Add possibility to apply a regex across s.Find(dl.Loc).Text()
// A bit hacky..
if len(fieldStringSelection.Nodes) > 0 {
if dl.Attr == "" {
currentChildIndex := 0
fieldStringNode := fieldStringSelection.Get(dl.NodeIndex).FirstChild
for fieldStringNode != nil {
if fieldStringNode.Type == html.TextNode {
// we 'abuse' the extractStringRegex func to find the correct text element.
var err error
fieldString, err = extractStringRegex(&dl.RegexExtract, fieldStringNode.Data)
if err == nil {
break
// If the child index is 0 (the default value if not explicitly defined) we loop over all the children.
// This makes it easier if there are many children and only one matches the regex. If only one
// matches the regex then the child index can even differ between various events.
// Plus we do not need to change existing crawler configs.
if currentChildIndex == dl.ChildIndex || dl.ChildIndex == 0 {
if fieldStringNode.Type == html.TextNode {
var err error
fieldString, err = extractStringRegex(&dl.RegexExtract, fieldStringNode.Data)
if err == nil {
break
}
}
}
fieldStringNode = fieldStringNode.NextSibling
currentChildIndex += 1
}
} else {
fieldString = fieldStringSelection.AttrOr(dl.Attr, "")
Expand Down Expand Up @@ -475,6 +480,7 @@ type DateField struct {
Loc string `yaml:"loc"`
Layout string `yaml:"layout"`
NodeIndex int `yaml:"node_index"`
ChildIndex int `yaml:"child_index"`
RegexExtract RegexConfig `yaml:"regex_extract"`
Attr string `yaml:"attr"`
}
Expand Down

0 comments on commit bb88300

Please sign in to comment.