VIRUXE · January 6, 2023 21:39
diff --git a/scrape_gtabase_vehicles.py b/scrape_gtabase_vehicles.py
 ############################################################################################################
 # Author: VIRUXE
 # github.com/VIRUXE
 # Scrape GTA V vehicle data from https://www.gtabase.com/grand-theft-auto-v/vehicles/
 ############################################################################################################

 import sys
 import os
 from decouple import config
 import requests
 from bs4 import BeautifulSoup
 import random
 import json
 import MySQLdb as MySQL

 # Base URL to scrape from
 base_url = "https://www.gtabase.com/grand-theft-auto-v/vehicles/"

 # Enum with all database columns
 class Properties:
    MODEL      = "model"
    CLASS      = "class"
    REALNAME   = "realname"
    BRAND      = "brand"
    SEATS      = "seats"
    PRICE      = "price"
    DLC        = "dlc"
    WEIGHT     = "weight"
    GEARS      = "gears"
    DRIVETRAIN = "drivetrain"

    labels = {
        MODEL     : "Model ID",
        CLASS     : "Vehicle Class",
        REALNAME  : "Based on (Real Life)",
        BRAND     : "Manufacturer",
        SEATS     : "Seats",
        PRICE     : ["Story Mode Price", "GTA Online Price"],
        DLC       : "DLC / Title Update",
        WEIGHT    : "Mass / Weight",
        GEARS     : "Gears",
        DRIVETRAIN: "Drive Train",
    }

    def GetColumnFromLabel(label):
        for column, labels in Properties.labels.items():
            if label in labels:
                return column

        return None

 # Connect to MySQL database, using ENV variables for credentials
 db = MySQL.connect(
    host     = config("DB_HOST"),
    user     = config("DB_USER"),
    password = config("DB_PASS"),
    database = config("DB_NAME")
 )
 cursor = db.cursor()

 def ScrapeVehicle(url_string):
    # If url_string contains https then it's a full URL, otherwise it's just the slug for the model ID
    vehicle_url = url_string if "https" in url_string else base_url   url_string
        
    file_path = "pages/"   vehicle_url.split('/')[-1]   ".html"

    # Check if we saved the HTML content for this vehicle/model already
    if os.path.isfile(file_path):
        print(f"Loading HTML content from {file_path}")
        with open(file_path, "r", encoding="utf-8") as f:
            html_content = f.read()
    else:
        print(f"Scraping vehicle data from {vehicle_url}")
        # Get the HTML content of the vehicle page
        r = requests.get(vehicle_url)
        if r.status_code != 200:
            dash_index = url_string.find("-")

            if dash_index == -1: # If there's no dash, then we can't do anything else
                print(f"\033[31mError: {r.status_code}, skipping\033[0m")
                return
            
            # Find the first dash and remove everything before it
            url_string = url_string[dash_index   1 :]
            print(f"\033[33mError: {r.status_code}, trying {url_string}\033[0m")
            vehicle_url = base_url   url_string
            r = requests.get(vehicle_url)
            if r.status_code != 200:
                print(f"\033[31mError: {r.status_code}, skipping\033[0m")
                return

        html_content = r.text

        # Save the HTML content to a file
        # Create the pages directory if it doesn't exist
        if not os.path.isdir("pages"):
            os.mkdir("pages")

        # Save the content to use later as "cache", so we don't have to scrape the same page again
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_content)
            print(f"Saved HTML content to {file_path}")
            
    # We'll store all the vehicle data in this dict
    vehicle = dict()

    # Get dat gud gud soup
    # Loop through all description details
    for dd in BeautifulSoup(html_content, "html.parser").find_all("dd"):
        # Only do something if this element has atleast 2 spans
        spans = dd.find_all("span")

        if len(spans) >= 2:
            # Get the first span and remove the colon at the end
            label  = spans[0].text.strip()
            column = Properties.GetColumnFromLabel(label)

            if not column: continue # Skip if we don't have a column for this label. It just means we don't need this data

            # Get the second span and remove the whitespace at the beginning and end
            value = spans[1].text.strip()

            if column == Properties.REALNAME: 
                value = value.replace("'", "\\'") # Escape single quotes
            elif column == Properties.CLASS: 
                value  = value.lower() # Our db enum is lowercase
                values = value.split(",") # Some vehicles have multiple classes for some reason
                value  = values[0] # We only want the first one, which is the main class

                class_enum = {
                    'boats'         : 'boatsmilitary',
                    'commercial'    : 'commercialvans',
                    'compacts'      : None,
                    'coupes'        : None,
                    'cycles'        : None,
                    'emergency'     : ['emergencysuvs', 'emergencyvans', 'emergencymotorcycles', 'helicoptersemergency', 'boatsemergency'],
                    'helicopters'   : None,
                    'industrial'    : 'industrialoff-road',
                    'military'      : ['commercialmilitary', 'militarysuvs', 'remote controlmilitary', 'militaryutility'],
                    'motorcycles'   : ['motorcyclesoff-road', 'motorcyclesutility'],
                    'muscle'        : 'muscletuners',
                    'off-road'      : ['off-roadservice', 'off-roadvans', 'remote controloff-road', 'off-roadsports classics'],
                    'planes'        : None,
                    'sedans'        : 'sedanssports',
                    'service'       : None,
                    'sports'        : None,
                    'sportsclassics': ['sports classics', 'sportssports classics'],
                    'supers'        : 'super',
                    'suvs'          : 'off-roadsuvs',
                    'trailer'       : None,
                    'trains'        : None,
                    'utility'       : None,
                    'vans'          : None,
                    'sportstuners'  : ['coupestuners', 'sedanstuners'],
                    'offroad-suvs'  : None,
                    'openwheel'     : 'open wheel'
                }

                for k, v in class_enum.items():
                    if isinstance(v, list):
                        if value in v:
                            value = k
                            break
                    elif isinstance(v, str):
                        if value == v:
                            value = k
                            break
            elif column == Properties.BRAND:
                if value == "Jacksheepe": # * This is the only exception in the entire brand enum
                    value = "Jack Sheepe"
            elif column == Properties.PRICE:
                # Remove dollar sign and commas
                value = value.replace("$", "").replace(",", "")
            elif column == Properties.DLC:
                dlc_enum = { # * This is not critical, but would be nice to have
                    'mpbeach'          : None,
                    'mpvalentines'     : None,
                    'mpbusiness'       : None,
                    'mpbusiness2'      : None,
                    'mphipster'        : None,
                    'mpindependence'   : None,
                    'spupgrade'        : None,
                    'mppilot'          : None,
                    'mplts'            : None,
                    'mpchristmas2'     : None,
                    'mpheist'          : None,
                    'mpluxe'           : None,
                    'mpluxe2'          : None,
                    'mplowrider'       : None,
                    'mphalloween'      : None,
                    'mpapartment'      : None,
                    'mpxmas_604490'    : None,
                    'mpjanuary2016'    : None,
                    'mpvalentines2'    : None,
                    'mplowrider2'      : None,
                    'mpexecutive'      : None,
                    'mpstunt'          : None,
                    'mpbiker'          : None,
                    'mpimportexport'   : None,
                    'mpspecialraces'   : None,
                    'mpgunrunning'     : None,
                    'mpsmuggler'       : None,
                    'mpchristmas2017'  : None,
                    'mpassault'        : None,
                    'mpbattle'         : None,
                    'mpchristmas2018'  : None,
                    'mpvinewood'       : None,
                    'mpheist3'         : '1.21 Heists DLC', # ! Confirm this.
                    'mpsum'            : None,
                    'mpheist4'         : None,
                    'mptuner'          : None,
                    'mpsecurity'       : None,
                    'mpg9ec'           : None,
                    'mpsum2'           : None,
                    'mpsum2_g9ec'      : None,
                    'mpchristmas3'     : None,
                    'mpchristmas3_g9ec': None
                }

                found = False

                for k, v in dlc_enum.items():
                    if value == v:
                        value = k
                        found = True
                        break

                if not found: value = None

            elif column == Properties.WEIGHT:
                # Remove the kg at the end
                value = value.replace("KG", "")
                # Remove comma
                value = value.replace(",", "")

            # Add the column and value if there is an actual value
            if column not in vehicle and not value == None: vehicle[column] = value.strip()

    # Print the vehicle data pretty
    for column, value in vehicle.items(): print(f"\033[1m{column}:\033[0m {value}")

    # Update vehicle using the model ID
    query = f"UPDATE vehicles_metadata SET "

    # Build up the query
    for column, value in vehicle.items():
        if column == Properties.MODEL: continue # Skip the model column. We'll use it in the WHERE clause
        query  = f"{column} = '{value}', "

    if Properties.MODEL not in vehicle: # If we don't have a model, we can't update the vehicle
        print(vehicle)
        return

    # Split Properties.MODEL using a comma to check if we have multiple models
    models = vehicle[Properties.MODEL].split(",")
    # If we have multiple models, we need to use the IN operator
    if len(models) > 1:
        print(f"\033[33mVehicle {vehicle[Properties.MODEL]} has multiple models, using IN operator\033[0m")

        # Replace all commas with a single quote, a comma and another single quote
        vehicle[Properties.MODEL] = vehicle[Properties.MODEL].replace(", ", "','")

        query = query[:-2]   f" WHERE model IN ('{vehicle[Properties.MODEL]}')"
    else:
        query = query[:-2]   f" WHERE model = '{vehicle[Properties.MODEL]}'"

    print(f"Executing query: {query}")

    # Execute the query
    cursor.execute(query)
    # Check if the query was successful
    if cursor.rowcount == 0:
        print(f"\033[31mFailed to update vehicle {vehicle[Properties.MODEL]}\033[0m")
    else:
        print(f"\033[32mSuccessfully updated vehicle {vehicle[Properties.MODEL]}\033[0m")

    # Commit the changes to the database
    db.commit()

    return True

 if len(sys.argv) < 2: # No argument was passed so we load models from a JSON file called "gtabase_model_urls.json"
    with open("gtabase_model_urls.json", "r") as f:
        models = json.load(f)

    # Scrape each vehicle model
    current_model = 0
    for model in models:
        current_model  = 1
        print(f"\nScraping vehicle {current_model}/{len(models)}\n")
        if ScrapeVehicle(model) == False: print(f"\033[31mFailed to scrape vehicle {model[0]}\033[0m")
 else:
    # Every argument is a vehicle url
    for url in sys.argv[1:]:
        ScrapeVehicle(url)

 # Close the database connection
 cursor.close()
 db.close()
	############################################################################################################
	# Author: VIRUXE
	# github.com/VIRUXE
	# Scrape GTA V vehicle data from https://www.gtabase.com/grand-theft-auto-v/vehicles/
	############################################################################################################

	import sys
	import os
	from decouple import config
	import requests
	from bs4 import BeautifulSoup
	import random
	import json
	import MySQLdb as MySQL

	# Base URL to scrape from
	base_url = "https://www.gtabase.com/grand-theft-auto-v/vehicles/"

	# Enum with all database columns
	class Properties:
	MODEL = "model"
	CLASS = "class"
	REALNAME = "realname"
	BRAND = "brand"
	SEATS = "seats"
	PRICE = "price"
	DLC = "dlc"
	WEIGHT = "weight"
	GEARS = "gears"
	DRIVETRAIN = "drivetrain"

	labels = {
	MODEL : "Model ID",
	CLASS : "Vehicle Class",
	REALNAME : "Based on (Real Life)",
	BRAND : "Manufacturer",
	SEATS : "Seats",
	PRICE : ["Story Mode Price", "GTA Online Price"],
	DLC : "DLC / Title Update",
	WEIGHT : "Mass / Weight",
	GEARS : "Gears",
	DRIVETRAIN: "Drive Train",
	}

	def GetColumnFromLabel(label):
	for column, labels in Properties.labels.items():
	if label in labels:
	return column

	return None

	# Connect to MySQL database, using ENV variables for credentials
	db = MySQL.connect(
	host = config("DB_HOST"),
	user = config("DB_USER"),
	password = config("DB_PASS"),
	database = config("DB_NAME")
	)
	cursor = db.cursor()

	def ScrapeVehicle(url_string):
	# If url_string contains https then it's a full URL, otherwise it's just the slug for the model ID
	vehicle_url = url_string if "https" in url_string else base_url url_string

	file_path = "pages/" vehicle_url.split('/')[-1] ".html"

	# Check if we saved the HTML content for this vehicle/model already
	if os.path.isfile(file_path):
	print(f"Loading HTML content from {file_path}")
	with open(file_path, "r", encoding="utf-8") as f:
	html_content = f.read()
	else:
	print(f"Scraping vehicle data from {vehicle_url}")
	# Get the HTML content of the vehicle page
	r = requests.get(vehicle_url)
	if r.status_code != 200:
	dash_index = url_string.find("-")

	if dash_index == -1: # If there's no dash, then we can't do anything else
	print(f"\033[31mError: {r.status_code}, skipping\033[0m")
	return

	# Find the first dash and remove everything before it
	url_string = url_string[dash_index 1 :]
	print(f"\033[33mError: {r.status_code}, trying {url_string}\033[0m")
	vehicle_url = base_url url_string
	r = requests.get(vehicle_url)
	if r.status_code != 200:
	print(f"\033[31mError: {r.status_code}, skipping\033[0m")
	return

	html_content = r.text

	# Save the HTML content to a file
	# Create the pages directory if it doesn't exist
	if not os.path.isdir("pages"):
	os.mkdir("pages")

	# Save the content to use later as "cache", so we don't have to scrape the same page again
	with open(file_path, "w", encoding="utf-8") as f:
	f.write(html_content)
	print(f"Saved HTML content to {file_path}")

	# We'll store all the vehicle data in this dict
	vehicle = dict()

	# Get dat gud gud soup
	# Loop through all description details
	for dd in BeautifulSoup(html_content, "html.parser").find_all("dd"):
	# Only do something if this element has atleast 2 spans
	spans = dd.find_all("span")

	if len(spans) >= 2:
	# Get the first span and remove the colon at the end
	label = spans[0].text.strip()
	column = Properties.GetColumnFromLabel(label)

	if not column: continue # Skip if we don't have a column for this label. It just means we don't need this data

	# Get the second span and remove the whitespace at the beginning and end
	value = spans[1].text.strip()

	if column == Properties.REALNAME:
	value = value.replace("'", "\\'") # Escape single quotes
	elif column == Properties.CLASS:
	value = value.lower() # Our db enum is lowercase
	values = value.split(",") # Some vehicles have multiple classes for some reason
	value = values[0] # We only want the first one, which is the main class

	class_enum = {
	'boats' : 'boatsmilitary',
	'commercial' : 'commercialvans',
	'compacts' : None,
	'coupes' : None,
	'cycles' : None,
	'emergency' : ['emergencysuvs', 'emergencyvans', 'emergencymotorcycles', 'helicoptersemergency', 'boatsemergency'],
	'helicopters' : None,
	'industrial' : 'industrialoff-road',
	'military' : ['commercialmilitary', 'militarysuvs', 'remote controlmilitary', 'militaryutility'],
	'motorcycles' : ['motorcyclesoff-road', 'motorcyclesutility'],
	'muscle' : 'muscletuners',
	'off-road' : ['off-roadservice', 'off-roadvans', 'remote controloff-road', 'off-roadsports classics'],
	'planes' : None,
	'sedans' : 'sedanssports',
	'service' : None,
	'sports' : None,
	'sportsclassics': ['sports classics', 'sportssports classics'],
	'supers' : 'super',
	'suvs' : 'off-roadsuvs',
	'trailer' : None,
	'trains' : None,
	'utility' : None,
	'vans' : None,
	'sportstuners' : ['coupestuners', 'sedanstuners'],
	'offroad-suvs' : None,
	'openwheel' : 'open wheel'
	}

	for k, v in class_enum.items():
	if isinstance(v, list):
	if value in v:
	value = k
	break
	elif isinstance(v, str):
	if value == v:
	value = k
	break
	elif column == Properties.BRAND:
	if value == "Jacksheepe": # * This is the only exception in the entire brand enum
	value = "Jack Sheepe"
	elif column == Properties.PRICE:
	# Remove dollar sign and commas
	value = value.replace("$", "").replace(",", "")
	elif column == Properties.DLC:
	dlc_enum = { # * This is not critical, but would be nice to have
	'mpbeach' : None,
	'mpvalentines' : None,
	'mpbusiness' : None,
	'mpbusiness2' : None,
	'mphipster' : None,
	'mpindependence' : None,
	'spupgrade' : None,
	'mppilot' : None,
	'mplts' : None,
	'mpchristmas2' : None,
	'mpheist' : None,
	'mpluxe' : None,
	'mpluxe2' : None,
	'mplowrider' : None,
	'mphalloween' : None,
	'mpapartment' : None,
	'mpxmas_604490' : None,
	'mpjanuary2016' : None,
	'mpvalentines2' : None,
	'mplowrider2' : None,
	'mpexecutive' : None,
	'mpstunt' : None,
	'mpbiker' : None,
	'mpimportexport' : None,
	'mpspecialraces' : None,
	'mpgunrunning' : None,
	'mpsmuggler' : None,
	'mpchristmas2017' : None,
	'mpassault' : None,
	'mpbattle' : None,
	'mpchristmas2018' : None,
	'mpvinewood' : None,
	'mpheist3' : '1.21 Heists DLC', # ! Confirm this.
	'mpsum' : None,
	'mpheist4' : None,
	'mptuner' : None,
	'mpsecurity' : None,
	'mpg9ec' : None,
	'mpsum2' : None,
	'mpsum2_g9ec' : None,
	'mpchristmas3' : None,
	'mpchristmas3_g9ec': None
	}

	found = False

	for k, v in dlc_enum.items():
	if value == v:
	value = k
	found = True
	break

	if not found: value = None

	elif column == Properties.WEIGHT:
	# Remove the kg at the end
	value = value.replace("KG", "")
	# Remove comma
	value = value.replace(",", "")

	# Add the column and value if there is an actual value
	if column not in vehicle and not value == None: vehicle[column] = value.strip()

	# Print the vehicle data pretty
	for column, value in vehicle.items(): print(f"\033[1m{column}:\033[0m {value}")

	# Update vehicle using the model ID
	query = f"UPDATE vehicles_metadata SET "

	# Build up the query
	for column, value in vehicle.items():
	if column == Properties.MODEL: continue # Skip the model column. We'll use it in the WHERE clause
	query = f"{column} = '{value}', "

	if Properties.MODEL not in vehicle: # If we don't have a model, we can't update the vehicle
	print(vehicle)
	return

	# Split Properties.MODEL using a comma to check if we have multiple models
	models = vehicle[Properties.MODEL].split(",")
	# If we have multiple models, we need to use the IN operator
	if len(models) > 1:
	print(f"\033[33mVehicle {vehicle[Properties.MODEL]} has multiple models, using IN operator\033[0m")

	# Replace all commas with a single quote, a comma and another single quote
	vehicle[Properties.MODEL] = vehicle[Properties.MODEL].replace(", ", "','")

	query = query[:-2] f" WHERE model IN ('{vehicle[Properties.MODEL]}')"
	else:
	query = query[:-2] f" WHERE model = '{vehicle[Properties.MODEL]}'"

	print(f"Executing query: {query}")

	# Execute the query
	cursor.execute(query)
	# Check if the query was successful
	if cursor.rowcount == 0:
	print(f"\033[31mFailed to update vehicle {vehicle[Properties.MODEL]}\033[0m")
	else:
	print(f"\033[32mSuccessfully updated vehicle {vehicle[Properties.MODEL]}\033[0m")

	# Commit the changes to the database
	db.commit()

	return True

	if len(sys.argv) < 2: # No argument was passed so we load models from a JSON file called "gtabase_model_urls.json"
	with open("gtabase_model_urls.json", "r") as f:
	models = json.load(f)

	# Scrape each vehicle model
	current_model = 0
	for model in models:
	current_model = 1
	print(f"\nScraping vehicle {current_model}/{len(models)}\n")
	if ScrapeVehicle(model) == False: print(f"\033[31mFailed to scrape vehicle {model[0]}\033[0m")
	else:
	# Every argument is a vehicle url
	for url in sys.argv[1:]:
	ScrapeVehicle(url)

	# Close the database connection
	cursor.close()
	db.close()