-
Notifications
You must be signed in to change notification settings - Fork 33
/
base_parse.py
executable file
·121 lines (98 loc) · 3.82 KB
/
base_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python2
"""Module for BaseParse"""
from scrapers import *
class BaseParse(object):
"""BaseParse class
Contains read json data, and parsed html data for the scrapers
to use. Also contains general methods to initialize the scrape.
Args:
link (str): link to the project page to scrape
Attributes:
json_data (dict): read json data from auth_data.json
soup (obj): BeautifulSoup obj containing parsed link
dir_name (str): directory name of the link
"""
def __init__(self, link=""):
self.htbn_link = link
self.json_data = self.get_json()
self.soup = self.get_soup()
self.dir_name = self.find_directory()
@property
def htbn_link(self):
return self.__htbn_link
@htbn_link.setter
def htbn_link(self, value):
"""Setter for htbn link
Must contain holberton's url format for projects.
Args:
value (str): comes from argv[1] as the project link
"""
valid_link = "intranet.hbtn.io/projects"
while not (valid_link in value):
print("[ERROR] Invalid link (must be to project on intranet.hbtn.io)")
value = raw_input("Enter link to project: ")
self.__htbn_link = value
def get_json(self):
"""Method that reads auth_data.json.
Sets json read to `json_data`
"""
super_path = os.path.dirname(os.path.abspath(__file__))
try:
with open("{}/auth_data.json".format(super_path.rsplit("/", 1)[0]), "r") as json_file:
return json.load(json_file)
except IOError:
print("[ERROR] Please run ./setup.sh to setup your auth data...")
sys.exit()
def get_soup(self):
"""Method that parses the `htbn_link` with BeautifulSoup
Initially logs in the intranet using mechanize and cookiejar.
Then requests for the html of the link, and sets it into `soup`.
Returns:
soup (obj): BeautifulSoup parsed html object
"""
login = "https://intranet.hbtn.io/auth/sign_in"
cj = cookielib.CookieJar()
br = mechanize.Browser()
sys.stdout.write(" -> Logging in... ")
try:
br.set_cookiejar(cj)
br.open(login)
br.select_form(nr=0)
br.form['user[login]'] = self.json_data["intra_user_key"]
br.form['user[password]'] = self.json_data["intra_pass_key"]
br.submit()
page = br.open(self.__htbn_link)
except AttributeError:
print("[ERROR] Login failed - are your auth_data credentials correct?")
sys.exit()
print("done")
self.soup = BeautifulSoup(page, 'html.parser')
br.close()
del page
return self.soup
def find_directory(self):
"""Method that scrapes for project's directory name
Sets project's directory's name to `dir_name`
"""
find_dir = self.soup.find(string=re.compile("Directory: "))
find_dir_text = find_dir.next_element.text
return find_dir_text
def create_directory(self):
"""Method that creates appropriate directory"""
sys.stdout.write(" -> Creating directory... ")
try:
os.mkdir(self.dir_name)
os.chdir(self.dir_name)
print("done")
except OSError:
print("[ERROR] Failed to create directory - does it already exist?")
sys.exit()
def project_type_check(self):
"""Method that checks the project's type
Checks for which scraper to use by scraping 'Github repository: '
Returns:
project (str): scraped project type
"""
find_project = self.soup.find(string=re.compile("GitHub repository: "))
project = find_project.next_sibling.text
return project