Last active
January 6, 2023 21:39
-
-
Save VIRUXE/77cbeded77c8333edd4f20fc326d0537 to your computer and use it in GitHub Desktop.
Scrape a GTA V Vehicle page from GTABase.com to make ready for database insertion
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################################################ | |
# Author: VIRUXE | |
# github.com/VIRUXE | |
# Scrape GTA V vehicle data from https://www.gtabase.com/grand-theft-auto-v/vehicles/ | |
############################################################################################################ | |
import sys | |
import os | |
from decouple import config | |
import requests | |
from bs4 import BeautifulSoup | |
import random | |
import json | |
import MySQLdb as MySQL | |
# Base URL to scrape from | |
base_url = "https://www.gtabase.com/grand-theft-auto-v/vehicles/" | |
# Enum with all database columns | |
class Properties: | |
MODEL = "model" | |
CLASS = "class" | |
REALNAME = "realname" | |
BRAND = "brand" | |
SEATS = "seats" | |
PRICE = "price" | |
DLC = "dlc" | |
WEIGHT = "weight" | |
GEARS = "gears" | |
DRIVETRAIN = "drivetrain" | |
labels = { | |
MODEL : "Model ID", | |
CLASS : "Vehicle Class", | |
REALNAME : "Based on (Real Life)", | |
BRAND : "Manufacturer", | |
SEATS : "Seats", | |
PRICE : ["Story Mode Price", "GTA Online Price"], | |
DLC : "DLC / Title Update", | |
WEIGHT : "Mass / Weight", | |
GEARS : "Gears", | |
DRIVETRAIN: "Drive Train", | |
} | |
def GetColumnFromLabel(label): | |
for column, labels in Properties.labels.items(): | |
if label in labels: | |
return column | |
return None | |
# Connect to MySQL database, using ENV variables for credentials | |
db = MySQL.connect( | |
host = config("DB_HOST"), | |
user = config("DB_USER"), | |
password = config("DB_PASS"), | |
database = config("DB_NAME") | |
) | |
cursor = db.cursor() | |
def ScrapeVehicle(url_string): | |
# If url_string contains https then it's a full URL, otherwise it's just the slug for the model ID | |
vehicle_url = url_string if "https" in url_string else base_url url_string | |
file_path = "pages/" vehicle_url.split('/')[-1] ".html" | |
# Check if we saved the HTML content for this vehicle/model already | |
if os.path.isfile(file_path): | |
print(f"Loading HTML content from {file_path}") | |
with open(file_path, "r", encoding="utf-8") as f: | |
html_content = f.read() | |
else: | |
print(f"Scraping vehicle data from {vehicle_url}") | |
# Get the HTML content of the vehicle page | |
r = requests.get(vehicle_url) | |
if r.status_code != 200: | |
dash_index = url_string.find("-") | |
if dash_index == -1: # If there's no dash, then we can't do anything else | |
print(f"\033[31mError: {r.status_code}, skipping\033[0m") | |
return | |
# Find the first dash and remove everything before it | |
url_string = url_string[dash_index 1 :] | |
print(f"\033[33mError: {r.status_code}, trying {url_string}\033[0m") | |
vehicle_url = base_url url_string | |
r = requests.get(vehicle_url) | |
if r.status_code != 200: | |
print(f"\033[31mError: {r.status_code}, skipping\033[0m") | |
return | |
html_content = r.text | |
# Save the HTML content to a file | |
# Create the pages directory if it doesn't exist | |
if not os.path.isdir("pages"): | |
os.mkdir("pages") | |
# Save the content to use later as "cache", so we don't have to scrape the same page again | |
with open(file_path, "w", encoding="utf-8") as f: | |
f.write(html_content) | |
print(f"Saved HTML content to {file_path}") | |
# We'll store all the vehicle data in this dict | |
vehicle = dict() | |
# Get dat gud gud soup | |
# Loop through all description details | |
for dd in BeautifulSoup(html_content, "html.parser").find_all("dd"): | |
# Only do something if this element has atleast 2 spans | |
spans = dd.find_all("span") | |
if len(spans) >= 2: | |
# Get the first span and remove the colon at the end | |
label = spans[0].text.strip() | |
column = Properties.GetColumnFromLabel(label) | |
if not column: continue # Skip if we don't have a column for this label. It just means we don't need this data | |
# Get the second span and remove the whitespace at the beginning and end | |
value = spans[1].text.strip() | |
if column == Properties.REALNAME: | |
value = value.replace("'", "\\'") # Escape single quotes | |
elif column == Properties.CLASS: | |
value = value.lower() # Our db enum is lowercase | |
values = value.split(",") # Some vehicles have multiple classes for some reason | |
value = values[0] # We only want the first one, which is the main class | |
class_enum = { | |
'boats' : 'boatsmilitary', | |
'commercial' : 'commercialvans', | |
'compacts' : None, | |
'coupes' : None, | |
'cycles' : None, | |
'emergency' : ['emergencysuvs', 'emergencyvans', 'emergencymotorcycles', 'helicoptersemergency', 'boatsemergency'], | |
'helicopters' : None, | |
'industrial' : 'industrialoff-road', | |
'military' : ['commercialmilitary', 'militarysuvs', 'remote controlmilitary', 'militaryutility'], | |
'motorcycles' : ['motorcyclesoff-road', 'motorcyclesutility'], | |
'muscle' : 'muscletuners', | |
'off-road' : ['off-roadservice', 'off-roadvans', 'remote controloff-road', 'off-roadsports classics'], | |
'planes' : None, | |
'sedans' : 'sedanssports', | |
'service' : None, | |
'sports' : None, | |
'sportsclassics': ['sports classics', 'sportssports classics'], | |
'supers' : 'super', | |
'suvs' : 'off-roadsuvs', | |
'trailer' : None, | |
'trains' : None, | |
'utility' : None, | |
'vans' : None, | |
'sportstuners' : ['coupestuners', 'sedanstuners'], | |
'offroad-suvs' : None, | |
'openwheel' : 'open wheel' | |
} | |
for k, v in class_enum.items(): | |
if isinstance(v, list): | |
if value in v: | |
value = k | |
break | |
elif isinstance(v, str): | |
if value == v: | |
value = k | |
break | |
elif column == Properties.BRAND: | |
if value == "Jacksheepe": # * This is the only exception in the entire brand enum | |
value = "Jack Sheepe" | |
elif column == Properties.PRICE: | |
# Remove dollar sign and commas | |
value = value.replace("$", "").replace(",", "") | |
elif column == Properties.DLC: | |
dlc_enum = { # * This is not critical, but would be nice to have | |
'mpbeach' : None, | |
'mpvalentines' : None, | |
'mpbusiness' : None, | |
'mpbusiness2' : None, | |
'mphipster' : None, | |
'mpindependence' : None, | |
'spupgrade' : None, | |
'mppilot' : None, | |
'mplts' : None, | |
'mpchristmas2' : None, | |
'mpheist' : None, | |
'mpluxe' : None, | |
'mpluxe2' : None, | |
'mplowrider' : None, | |
'mphalloween' : None, | |
'mpapartment' : None, | |
'mpxmas_604490' : None, | |
'mpjanuary2016' : None, | |
'mpvalentines2' : None, | |
'mplowrider2' : None, | |
'mpexecutive' : None, | |
'mpstunt' : None, | |
'mpbiker' : None, | |
'mpimportexport' : None, | |
'mpspecialraces' : None, | |
'mpgunrunning' : None, | |
'mpsmuggler' : None, | |
'mpchristmas2017' : None, | |
'mpassault' : None, | |
'mpbattle' : None, | |
'mpchristmas2018' : None, | |
'mpvinewood' : None, | |
'mpheist3' : '1.21 Heists DLC', # ! Confirm this. | |
'mpsum' : None, | |
'mpheist4' : None, | |
'mptuner' : None, | |
'mpsecurity' : None, | |
'mpg9ec' : None, | |
'mpsum2' : None, | |
'mpsum2_g9ec' : None, | |
'mpchristmas3' : None, | |
'mpchristmas3_g9ec': None | |
} | |
found = False | |
for k, v in dlc_enum.items(): | |
if value == v: | |
value = k | |
found = True | |
break | |
if not found: value = None | |
elif column == Properties.WEIGHT: | |
# Remove the kg at the end | |
value = value.replace("KG", "") | |
# Remove comma | |
value = value.replace(",", "") | |
# Add the column and value if there is an actual value | |
if column not in vehicle and not value == None: vehicle[column] = value.strip() | |
# Print the vehicle data pretty | |
for column, value in vehicle.items(): print(f"\033[1m{column}:\033[0m {value}") | |
# Update vehicle using the model ID | |
query = f"UPDATE vehicles_metadata SET " | |
# Build up the query | |
for column, value in vehicle.items(): | |
if column == Properties.MODEL: continue # Skip the model column. We'll use it in the WHERE clause | |
query = f"{column} = '{value}', " | |
if Properties.MODEL not in vehicle: # If we don't have a model, we can't update the vehicle | |
print(vehicle) | |
return | |
# Split Properties.MODEL using a comma to check if we have multiple models | |
models = vehicle[Properties.MODEL].split(",") | |
# If we have multiple models, we need to use the IN operator | |
if len(models) > 1: | |
print(f"\033[33mVehicle {vehicle[Properties.MODEL]} has multiple models, using IN operator\033[0m") | |
# Replace all commas with a single quote, a comma and another single quote | |
vehicle[Properties.MODEL] = vehicle[Properties.MODEL].replace(", ", "','") | |
query = query[:-2] f" WHERE model IN ('{vehicle[Properties.MODEL]}')" | |
else: | |
query = query[:-2] f" WHERE model = '{vehicle[Properties.MODEL]}'" | |
print(f"Executing query: {query}") | |
# Execute the query | |
cursor.execute(query) | |
# Check if the query was successful | |
if cursor.rowcount == 0: | |
print(f"\033[31mFailed to update vehicle {vehicle[Properties.MODEL]}\033[0m") | |
else: | |
print(f"\033[32mSuccessfully updated vehicle {vehicle[Properties.MODEL]}\033[0m") | |
# Commit the changes to the database | |
db.commit() | |
return True | |
if len(sys.argv) < 2: # No argument was passed so we load models from a JSON file called "gtabase_model_urls.json" | |
with open("gtabase_model_urls.json", "r") as f: | |
models = json.load(f) | |
# Scrape each vehicle model | |
current_model = 0 | |
for model in models: | |
current_model = 1 | |
print(f"\nScraping vehicle {current_model}/{len(models)}\n") | |
if ScrapeVehicle(model) == False: print(f"\033[31mFailed to scrape vehicle {model[0]}\033[0m") | |
else: | |
# Every argument is a vehicle url | |
for url in sys.argv[1:]: | |
ScrapeVehicle(url) | |
# Close the database connection | |
cursor.close() | |
db.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment