Skip to content

Instantly share code, notes, and snippets.

@VIRUXE
Last active January 6, 2023 21:39
Show Gist options
  • Save VIRUXE/77cbeded77c8333edd4f20fc326d0537 to your computer and use it in GitHub Desktop.
Save VIRUXE/77cbeded77c8333edd4f20fc326d0537 to your computer and use it in GitHub Desktop.
Scrape a GTA V Vehicle page from GTABase.com to make ready for database insertion
############################################################################################################
# Author: VIRUXE
# github.com/VIRUXE
# Scrape GTA V vehicle data from https://www.gtabase.com/grand-theft-auto-v/vehicles/
############################################################################################################
import sys
import os
from decouple import config
import requests
from bs4 import BeautifulSoup
import random
import json
import MySQLdb as MySQL
# Base URL to scrape from
base_url = "https://www.gtabase.com/grand-theft-auto-v/vehicles/"
# Enum with all database columns
class Properties:
MODEL = "model"
CLASS = "class"
REALNAME = "realname"
BRAND = "brand"
SEATS = "seats"
PRICE = "price"
DLC = "dlc"
WEIGHT = "weight"
GEARS = "gears"
DRIVETRAIN = "drivetrain"
labels = {
MODEL : "Model ID",
CLASS : "Vehicle Class",
REALNAME : "Based on (Real Life)",
BRAND : "Manufacturer",
SEATS : "Seats",
PRICE : ["Story Mode Price", "GTA Online Price"],
DLC : "DLC / Title Update",
WEIGHT : "Mass / Weight",
GEARS : "Gears",
DRIVETRAIN: "Drive Train",
}
def GetColumnFromLabel(label):
for column, labels in Properties.labels.items():
if label in labels:
return column
return None
# Connect to MySQL database, using ENV variables for credentials
db = MySQL.connect(
host = config("DB_HOST"),
user = config("DB_USER"),
password = config("DB_PASS"),
database = config("DB_NAME")
)
cursor = db.cursor()
def ScrapeVehicle(url_string):
# If url_string contains https then it's a full URL, otherwise it's just the slug for the model ID
vehicle_url = url_string if "https" in url_string else base_url url_string
file_path = "pages/" vehicle_url.split('/')[-1] ".html"
# Check if we saved the HTML content for this vehicle/model already
if os.path.isfile(file_path):
print(f"Loading HTML content from {file_path}")
with open(file_path, "r", encoding="utf-8") as f:
html_content = f.read()
else:
print(f"Scraping vehicle data from {vehicle_url}")
# Get the HTML content of the vehicle page
r = requests.get(vehicle_url)
if r.status_code != 200:
dash_index = url_string.find("-")
if dash_index == -1: # If there's no dash, then we can't do anything else
print(f"\033[31mError: {r.status_code}, skipping\033[0m")
return
# Find the first dash and remove everything before it
url_string = url_string[dash_index 1 :]
print(f"\033[33mError: {r.status_code}, trying {url_string}\033[0m")
vehicle_url = base_url url_string
r = requests.get(vehicle_url)
if r.status_code != 200:
print(f"\033[31mError: {r.status_code}, skipping\033[0m")
return
html_content = r.text
# Save the HTML content to a file
# Create the pages directory if it doesn't exist
if not os.path.isdir("pages"):
os.mkdir("pages")
# Save the content to use later as "cache", so we don't have to scrape the same page again
with open(file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"Saved HTML content to {file_path}")
# We'll store all the vehicle data in this dict
vehicle = dict()
# Get dat gud gud soup
# Loop through all description details
for dd in BeautifulSoup(html_content, "html.parser").find_all("dd"):
# Only do something if this element has atleast 2 spans
spans = dd.find_all("span")
if len(spans) >= 2:
# Get the first span and remove the colon at the end
label = spans[0].text.strip()
column = Properties.GetColumnFromLabel(label)
if not column: continue # Skip if we don't have a column for this label. It just means we don't need this data
# Get the second span and remove the whitespace at the beginning and end
value = spans[1].text.strip()
if column == Properties.REALNAME:
value = value.replace("'", "\\'") # Escape single quotes
elif column == Properties.CLASS:
value = value.lower() # Our db enum is lowercase
values = value.split(",") # Some vehicles have multiple classes for some reason
value = values[0] # We only want the first one, which is the main class
class_enum = {
'boats' : 'boatsmilitary',
'commercial' : 'commercialvans',
'compacts' : None,
'coupes' : None,
'cycles' : None,
'emergency' : ['emergencysuvs', 'emergencyvans', 'emergencymotorcycles', 'helicoptersemergency', 'boatsemergency'],
'helicopters' : None,
'industrial' : 'industrialoff-road',
'military' : ['commercialmilitary', 'militarysuvs', 'remote controlmilitary', 'militaryutility'],
'motorcycles' : ['motorcyclesoff-road', 'motorcyclesutility'],
'muscle' : 'muscletuners',
'off-road' : ['off-roadservice', 'off-roadvans', 'remote controloff-road', 'off-roadsports classics'],
'planes' : None,
'sedans' : 'sedanssports',
'service' : None,
'sports' : None,
'sportsclassics': ['sports classics', 'sportssports classics'],
'supers' : 'super',
'suvs' : 'off-roadsuvs',
'trailer' : None,
'trains' : None,
'utility' : None,
'vans' : None,
'sportstuners' : ['coupestuners', 'sedanstuners'],
'offroad-suvs' : None,
'openwheel' : 'open wheel'
}
for k, v in class_enum.items():
if isinstance(v, list):
if value in v:
value = k
break
elif isinstance(v, str):
if value == v:
value = k
break
elif column == Properties.BRAND:
if value == "Jacksheepe": # * This is the only exception in the entire brand enum
value = "Jack Sheepe"
elif column == Properties.PRICE:
# Remove dollar sign and commas
value = value.replace("$", "").replace(",", "")
elif column == Properties.DLC:
dlc_enum = { # * This is not critical, but would be nice to have
'mpbeach' : None,
'mpvalentines' : None,
'mpbusiness' : None,
'mpbusiness2' : None,
'mphipster' : None,
'mpindependence' : None,
'spupgrade' : None,
'mppilot' : None,
'mplts' : None,
'mpchristmas2' : None,
'mpheist' : None,
'mpluxe' : None,
'mpluxe2' : None,
'mplowrider' : None,
'mphalloween' : None,
'mpapartment' : None,
'mpxmas_604490' : None,
'mpjanuary2016' : None,
'mpvalentines2' : None,
'mplowrider2' : None,
'mpexecutive' : None,
'mpstunt' : None,
'mpbiker' : None,
'mpimportexport' : None,
'mpspecialraces' : None,
'mpgunrunning' : None,
'mpsmuggler' : None,
'mpchristmas2017' : None,
'mpassault' : None,
'mpbattle' : None,
'mpchristmas2018' : None,
'mpvinewood' : None,
'mpheist3' : '1.21 Heists DLC', # ! Confirm this.
'mpsum' : None,
'mpheist4' : None,
'mptuner' : None,
'mpsecurity' : None,
'mpg9ec' : None,
'mpsum2' : None,
'mpsum2_g9ec' : None,
'mpchristmas3' : None,
'mpchristmas3_g9ec': None
}
found = False
for k, v in dlc_enum.items():
if value == v:
value = k
found = True
break
if not found: value = None
elif column == Properties.WEIGHT:
# Remove the kg at the end
value = value.replace("KG", "")
# Remove comma
value = value.replace(",", "")
# Add the column and value if there is an actual value
if column not in vehicle and not value == None: vehicle[column] = value.strip()
# Print the vehicle data pretty
for column, value in vehicle.items(): print(f"\033[1m{column}:\033[0m {value}")
# Update vehicle using the model ID
query = f"UPDATE vehicles_metadata SET "
# Build up the query
for column, value in vehicle.items():
if column == Properties.MODEL: continue # Skip the model column. We'll use it in the WHERE clause
query = f"{column} = '{value}', "
if Properties.MODEL not in vehicle: # If we don't have a model, we can't update the vehicle
print(vehicle)
return
# Split Properties.MODEL using a comma to check if we have multiple models
models = vehicle[Properties.MODEL].split(",")
# If we have multiple models, we need to use the IN operator
if len(models) > 1:
print(f"\033[33mVehicle {vehicle[Properties.MODEL]} has multiple models, using IN operator\033[0m")
# Replace all commas with a single quote, a comma and another single quote
vehicle[Properties.MODEL] = vehicle[Properties.MODEL].replace(", ", "','")
query = query[:-2] f" WHERE model IN ('{vehicle[Properties.MODEL]}')"
else:
query = query[:-2] f" WHERE model = '{vehicle[Properties.MODEL]}'"
print(f"Executing query: {query}")
# Execute the query
cursor.execute(query)
# Check if the query was successful
if cursor.rowcount == 0:
print(f"\033[31mFailed to update vehicle {vehicle[Properties.MODEL]}\033[0m")
else:
print(f"\033[32mSuccessfully updated vehicle {vehicle[Properties.MODEL]}\033[0m")
# Commit the changes to the database
db.commit()
return True
if len(sys.argv) < 2: # No argument was passed so we load models from a JSON file called "gtabase_model_urls.json"
with open("gtabase_model_urls.json", "r") as f:
models = json.load(f)
# Scrape each vehicle model
current_model = 0
for model in models:
current_model = 1
print(f"\nScraping vehicle {current_model}/{len(models)}\n")
if ScrapeVehicle(model) == False: print(f"\033[31mFailed to scrape vehicle {model[0]}\033[0m")
else:
# Every argument is a vehicle url
for url in sys.argv[1:]:
ScrapeVehicle(url)
# Close the database connection
cursor.close()
db.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment