-
Notifications
You must be signed in to change notification settings - Fork 0
/
Cricinfo_DataExtraction.R
66 lines (50 loc) · 1.79 KB
/
Cricinfo_DataExtraction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
library(XML)
# NOTE(review): hard-coded, machine-specific working directory. Nothing in the
# visible script reads or writes a relative path, so this call looks
# unnecessary -- confirm before removing.
setwd("F:/Krupan/Documents/2_Freelance/3_Github_Projects/IPL_Team_Auction_Analytics/")

# ---- Read all match scorecard URLs ----
# Series landing page for IPL 2016; every match scorecard is linked from it.
ipl2016url <- "http://www.espncricinfo.com/indian-premier-league-2016/engine/series/968923.html"

# Collect each distinct link target found on the series page.
urls <- unique(XML::getHTMLLinks(ipl2016url))

# Keep only the links that point at an individual match page. The pattern is a
# literal path fragment, so it is matched as fixed text rather than a regex.
is_match_link <- grepl("/indian-premier-league-2016/engine/match/", urls, fixed = TRUE)
match_paths <- as.character(urls[is_match_link])

# The links are site-relative, so prepend the host. paste0() recycles a
# zero-length argument to "", which would turn "no matches" into one bogus URL
# (just the host); return an empty vector in that case instead.
scorecardUrls <- if (length(match_paths) > 0) {
  paste0("http://www.espncricinfo.com", match_paths)
} else {
  character(0)
}
str(scorecardUrls)
# ---- Scratch exploration of the series page markup ----
# NOTE(review): this section reads like interactive experimentation. Several
# statements refer to objects that are not yet defined when the script is run
# from top to bottom (see the notes below) -- confirm whether it is still needed.
library(XML)
library(RCurl)
# Download the series page (URL held in `ipl2016url`) as text.
xData <- getURL(ipl2016url)
# Parse the downloaded text with xmlParse().
# NOTE(review): this value of `doc` is overwritten by the very next statement.
doc <- xmlParse(xData)
# Parse the same page again, this time passing the URL to htmlTreeParse().
doc <- htmlTreeParse(ipl2016url)
# Feed the third component of `doc` to xmlParse() and keep the root node.
# NOTE(review): `xmldoc` is not referenced again in the visible script.
xmldoc = xmlRoot(xmlParse(doc[3]))
# Print the third component of `doc` for inspection.
doc[3]
# Show the attributes of the root of the fourth element of `doc$children`.
xmlAttrs(xmlRoot(doc$children[[4]]))
# NOTE(review): `html` is not assigned anywhere in the visible script, so this
# line presumably fails in a fresh session -- confirm which object was meant.
td <- xmlRoot(xmlParse(html))
# Look up elements named "team-1-name" under the third element of
# `doc$children`.
xmlElementsByTagName(doc$children[[3]], "team-1-name")
# Declare `doc.text` as UTF-8 so the text displays properly, then coerce it to
# character.
# NOTE(review): `doc.text` is first assigned further down the script (from the
# match page), so these two lines presumably fail when the script is run in
# order -- confirm the intended position.
Encoding(doc.text) = "UTF-8"
doc.text = as.character(doc.text)
#' Read every HTML table from a scorecard page
#'
#' @param url Location of the scorecard page; passed unchanged to
#'   `readHTMLTable()`.
#' @return The list returned by `readHTMLTable()` with its elements renamed
#'   "Table1", "Table2", ... in their original order. When the page yields no
#'   tables the (empty) result is returned unchanged.
fn_getScoreCard <- function(url) {
  tables <- readHTMLTable(url, stringsAsFactors = FALSE)

  # Only rename when there is something to rename: building the names from an
  # empty sequence would not give a zero-length vector.
  if (length(tables) > 0) {
    names(tables) <- paste0("Table", seq_along(tables))
  }

  # Return the tables themselves. Ending on the `names<-` assignment would make
  # the function (invisibly) return just the vector of names.
  tables
}
# ---- Explore a single match scorecard ----
matchurl <- "http://www.espncricinfo.com/indian-premier-league-2016/engine/match/981019.html"

# Read every HTML table on the match page, keeping text columns as character.
tables <- readHTMLTable(matchurl, stringsAsFactors = FALSE)
tables.length <- length(tables)

# Label the tables by position. Skipped when nothing was read, because a
# 1:0 style sequence would otherwise produce the indices c(1, 0).
if (tables.length > 0) {
  names(tables) <- paste0("Table", seq_len(tables.length))
}
names(tables)
# Table 1 - 1st Innings Batting
# Table 2 - 1st Innings Bowling
# Table 3 - 2nd Innings Batting
# Table 4 - 2nd Innings Bowling

# Parse the page into an internal document (argument name spelled out in full
# rather than relying on partial argument matching).
doc.html <- XML::htmlTreeParse(matchurl, useInternalNodes = TRUE)
str(doc.html)

# Flatten the attributes of every <a> element on the page into one vector.
doc.text <- unlist(XML::xpathApply(doc.html, "//a", xmlAttrs))

tables[1]
str(tables)

# Pull out the second table by exact name.
a <- tables[["Table2"]]

# View() opens a data viewer, which is only usable in an interactive session.
if (interactive()) {
  View(a)
}

# Show which rows have no value in the `O` column, then drop those rows.
is.na(a[["O"]])
a <- a[!is.na(a[["O"]]), , drop = FALSE]

# Remove the two columns named " " and " .1".
a[[" "]] <- NULL
a[[" .1"]] <- NULL

if (interactive()) {
  View(na.omit(a))
}