Skip to content

Commit

Permalink
Misc. CI fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
abejgonzalez committed Dec 6, 2022
1 parent 08f9a35 commit 273f6b2
Show file tree
Hide file tree
Showing 20 changed files with 386 additions and 234 deletions.
88 changes: 58 additions & 30 deletions .github/scripts/ci_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,59 3,87 @@
# This package contains utilities that rely on environment variable
# definitions present only on the CI container instance.

# Create an env. dict that is populated from the environment or from defaults
ci_env = {}

# If not running under a CI pipeline defaults are provided that
# will suffice to run scripts that do not use GHA API calls.
# To manually provide environment variable settings, export GITHUB_ACTIONS=true, and provide
# values for all of the environment variables below.
RUN_AZURE_CREDITED_ENV = bool(os.environ.get('AZURE_CREDITED_ENV', False))
# GITHUB_ACTIONS is kept as a string; anything other than the literal "false"
# (including the variable simply being set to "true") counts as running under CI.
ci_env['GITHUB_ACTIONS'] = os.environ.get('GITHUB_ACTIONS', "false")
RUN_LOCAL = ci_env['GITHUB_ACTIONS'] == 'false'
# When running locally (not in a CI pipeline) run commands out of the clone hosting this file.
# Three levels up from this file's resolved path (normpath collapses the "..").
local_fsim_dir = os.path.normpath(os.path.realpath(__file__) + "/../../..")

# Add list of env. variables to the ci_env dict
def add_env_vars(env_vars, default_value = ""):
    """Populate ci_env with one entry per name in env_vars.

    When running locally (RUN_LOCAL is true) every requested name is set to
    default_value, whether or not it is exported in the current shell, so
    that later ci_env[...] lookups do not raise KeyError.

    When running under CI the value is copied from os.environ; names that
    are not defined in the environment are left out of ci_env.

    env_vars: iterable of environment variable names.
    default_value: value stored for every name when running locally.
    """
    for name in env_vars:
        if RUN_LOCAL:
            ci_env[name] = default_value
        elif name in os.environ:
            ci_env[name] = os.environ[name]

# CI instance environment variables

# This is used as a unique tag for all instances launched in a workflow
ci_workflow_run_id = os.environ['GITHUB_RUN_ID'] if not RUN_LOCAL else 0
ci_commit_sha1 = os.environ['GITHUB_SHA'] if not RUN_LOCAL else 0
gh_env_vars = {
# This is used as a unique tag for all instances launched in a workflow
'GITHUB_RUN_ID',
'GITHUB_SHA',
}
add_env_vars(gh_env_vars, 0)

# Multiple clones of the FireSim repository exist on the manager. We expect state
# to persist between jobs in a workflow and facilitate that by having jobs run
# out of a centralized clone (ci_firesim_dir)-- not the default clones setup by
# the GHA runners (ci_workdir)
# out of a centralized clone (MANAGER_FIRESIM_LOCATION)-- not the default clones setup by
# the GHA runners (GITHUB_WORKSPACE)

# This is the location of the clone setup by the GHA runner infrastructure by default
# expanduser to replace the ~ present in the default, for portability
ci_workdir = os.path.expanduser(os.environ['GITHUB_WORKSPACE']) if not RUN_LOCAL else local_fsim_dir
ci_env['GITHUB_WORKSPACE'] = os.path.expanduser(os.environ['GITHUB_WORKSPACE']) if not RUN_LOCAL else local_fsim_dir

# This is the location of the reused clone. CI scripts should refer variables
# derived from this path so that they may be reused across workflows that may
# initialize the FireSim repository differently (e.g., as a submodule of a
# larger project.)
ci_firesim_dir = os.path.expanduser(os.environ['MANAGER_FIRESIM_LOCATION']) if not RUN_LOCAL else local_fsim_dir
ci_env['MANAGER_FIRESIM_LOCATION'] = os.path.expanduser(os.environ['MANAGER_FIRESIM_LOCATION']) if not RUN_LOCAL else local_fsim_dir

ci_api_token = os.environ['GITHUB_TOKEN'] if not RUN_LOCAL else 0
ci_personal_api_token = os.environ['PERSONAL_ACCESS_TOKEN'] if not RUN_LOCAL else 0
gh_env_vars = {
'GITHUB_TOKEN',
'PERSONAL_ACCESS_TOKEN',
}
add_env_vars(gh_env_vars, 0)

ci_gha_api_url = os.environ['GITHUB_API_URL'] if not RUN_LOCAL else ""
# We look this up, instead of hardcoding "firesim/firesim", to support running
# this CI pipeline under forks.
ci_repo_name = os.environ['GITHUB_REPOSITORY'] if not RUN_LOCAL else ""
gh_env_vars = {
'GITHUB_API_URL',
# We look this up, instead of hardcoding "firesim/firesim", to support running
# this CI pipeline under forks.
'GITHUB_REPOSITORY',
'GITHUB_EVENT_PATH',
}
add_env_vars(gh_env_vars)

# The following are environment variables used by AWS and AZURE to setup the corresponding
# self-hosted Github Actions Runners

ci_aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'] if not RUN_LOCAL else ""
ci_aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY'] if not RUN_LOCAL else ""
ci_aws_default_region = os.environ['AWS_DEFAULT_REGION'] if not RUN_LOCAL else ""

ci_azure_client_id = os.environ['AZURE_CLIENT_ID'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_client_secret = os.environ['AZURE_CLIENT_SECRET'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_tenant_id = os.environ['AZURE_TENANT_ID'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_sub_id = os.environ['AZURE_SUBSCRIPTION_ID'] if (not RUN_LOCAL) or RUN_AZURE_CREDITED_ENV else ""
ci_azure_default_region = os.environ['AZURE_DEFAULT_REGION'] if not RUN_LOCAL else ""
ci_azure_resource_group = os.environ['AZURE_RESOURCE_GROUP'] if not RUN_LOCAL else ""
ci_azure_subnet_id = os.environ['AZURE_CI_SUBNET_ID'] if not RUN_LOCAL else ""
ci_azure_nsg_id = os.environ['AZURE_CI_NSG_ID'] if not RUN_LOCAL else ""

ci_firesim_pem = os.environ['FIRESIM_PEM'] if not RUN_LOCAL else ""
ci_firesim_pem_public = os.environ['FIRESIM_PEM_PUBLIC'] if not RUN_LOCAL else ""
aws_env_vars = {
'AWS_ACCESS_KEY_ID',
'AWS_SECRET_ACCESS_KEY',
'AWS_DEFAULT_REGION',
}
add_env_vars(aws_env_vars)

azure_env_vars = {
'AZURE_CLIENT_ID',
'AZURE_CLIENT_SECRET',
'AZURE_TENANT_ID',
'AZURE_SUBSCRIPTION_ID',
'AZURE_DEFAULT_REGION',
'AZURE_RESOURCE_GROUP',
'AZURE_CI_SUBNET_ID',
'AZURE_CI_NSG_ID',
}
add_env_vars(azure_env_vars)

pem_env_vars = {
'FIRESIM_PEM',
'FIRESIM_PEM_PUBLIC',
}
add_env_vars(pem_env_vars)
54 changes: 5 additions & 49 deletions .github/scripts/common.py
Original file line number Diff line number Diff line change
@@ -1,21 1,16 @@
import math
from fabric.api import *
import requests
from ci_variables import ci_gha_api_url, ci_repo_name, ci_firesim_dir

from typing import Dict, List, Any
from typing import Dict

from platform_lib import PlatformLib, AWSPlatformLib, AzurePlatformLib, Platform

# Github URL related constants
gha_api_url = f"{ci_gha_api_url}/repos/{ci_repo_name}/actions"
gha_runners_api_url = f"{gha_api_url}/runners"
gha_runs_api_url = f"{gha_api_url}/runs"
from ci_variables import ci_env
from github_common import deregister_runners

# Remote paths
manager_home_dir = "/home/centos"
manager_fsim_pem = manager_home_dir + "/firesim.pem"
# Location of the reused FireSim clone, as provided through ci_env.
manager_fsim_dir = ci_env['MANAGER_FIRESIM_LOCATION']
manager_marshal_dir = manager_fsim_dir + "/sw/firesim-software"
manager_ci_dir = manager_fsim_dir + "/.github/scripts"

Expand All @@ -27,48 22,9 @@
env.disable_known_hosts = True
env.keepalive = 60 # keep long SSH connections running

def set_fabric_firesim_pem() -> None:
    """Point fabric's env.key_filename at the firesim.pem path (manager_fsim_pem)."""
    env.key_filename = manager_fsim_pem

def get_header(gh_token: str) -> Dict[str, str]:
    """Build the HTTP headers used for GitHub API requests.

    gh_token: API token; surrounding whitespace (e.g. a trailing newline) is stripped.
    Returns a dict with the Authorization and Accept headers.
    """
    return {"Authorization": f"token {gh_token.strip()}", "Accept": "application/vnd.github+json"}

def get_runners(gh_token: str) -> List:
    """Return every runner listed at gha_runners_api_url.

    A first request reads "total_count"; the runners are then fetched
    30 per page and concatenated.

    gh_token: API token passed to get_header.
    Raises Exception if any request returns a status other than 200.
    """
    r = requests.get(gha_runners_api_url, headers=get_header(gh_token))
    if r.status_code != 200:
        raise Exception(f"Unable to retrieve count of GitHub Actions Runners\nFull Response Below:\n{r}")
    runner_count = r.json()["total_count"]

    runners = []
    for page_idx in range(math.ceil(runner_count / 30)):
        # Pages are requested starting from 1, hence page_idx + 1.
        r = requests.get(gha_runners_api_url, params={"per_page" : 30, "page" : page_idx + 1}, headers=get_header(gh_token))
        if r.status_code != 200:
            raise Exception(f"Unable to retrieve (sub)list of GitHub Actions Runners\nFull Response Below\n{r}")
        runners.extend(r.json()["runners"])

    return runners

def delete_runner(gh_token: str, runner: Dict[str, Any]) -> bool:
    """Issue a DELETE for one runner and report whether it succeeded.

    runner: runner record; its "id" and "name" entries are read.
    Returns True when the response status is 204, otherwise prints a
    diagnostic and returns False.
    """
    runner_url = f"""{gha_runners_api_url}/{runner["id"]}"""
    r = requests.delete(runner_url, headers=get_header(gh_token))
    deleted = r.status_code == 204
    if not deleted:
        print(f"""Unable to delete runner {runner["name"]} with id: {runner["id"]}\nFull Response Below\n{r}""")
    return deleted

def deregister_offline_runners(gh_token: str) -> None:
    """Delete every runner whose "status" field equals "offline"."""
    for candidate in get_runners(gh_token):
        if candidate["status"] != "offline":
            continue
        delete_runner(gh_token, candidate)

def deregister_runners(gh_token: str, runner_name: str) -> None:
    """Delete every runner whose "name" contains runner_name as a substring."""
    for candidate in get_runners(gh_token):
        if runner_name not in candidate["name"]:
            continue
        delete_runner(gh_token, candidate)

aws_platform_lib = AWSPlatformLib(deregister_runners)
#azure_platform_lib = AzurePlatformLib(deregister_runners)

Expand Down
39 changes: 28 additions & 11 deletions .github/scripts/cull-old-ci-instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 9,21 @@
import pytz
import boto3
import sys

from platform_lib import Platform
from common import deregister_runners, get_platform_lib
from common import get_platform_lib
from github_common import deregister_runners

# Reuse manager utilities
from ci_variables import ci_workdir, ci_personal_api_token, ci_workflow_run_id, ci_azure_sub_id
from ci_variables import ci_env
# Make modules under <GITHUB_WORKSPACE>/deploy importable from this script.
sys.path.append(ci_env['GITHUB_WORKSPACE'] + "/deploy")

# The number of hours an instance may exist since its initial launch time
# The number of hours a manager instance may exist since its initial launch time
INSTANCE_LIFETIME_LIMIT_HOURS = 8
# The number of hours a fpga instance may exist since its initial launch time
FPGA_INSTANCE_LIFETIME_LIMIT_HOURS = 1

def find_timed_out_resources(hr_limit: int, current_time: DateTime, resource_list: Iterable[Tuple]) -> list:
    """
    Return the resources that have existed for more than hr_limit hours.

    Because of the differences in how AWS and Azure store time tags, the resource_list
    in this case is an iterable of tuples with the 0 index being the instance/vm and the
    1 index being the launch time to subtract from current_time.

    hr_limit: lifetime limit in hours; a resource exactly at the limit is not timed out.
    current_time: the reference "now"; must support subtraction with each launch time.
    resource_list: iterable of (resource, launch_time) tuples; consumed once.
    Returns the list of resources (index 0 of each tuple) over the limit, in input order.
    """
    limit_secs = hr_limit * 3600
    return [
        resource_tuple[0]
        for resource_tuple in resource_list
        if (current_time - resource_tuple[1]).total_seconds() > limit_secs
    ]

def cull_aws_instances(current_time: DateTime) -> None:
    """Terminate AWS CI instances that have outlived their lifetime limit.

    Instances returned by find_select_ci_instances() are checked against
    FPGA_INSTANCE_LIFETIME_LIMIT_HOURS and those returned by
    find_all_ci_instances() against INSTANCE_LIFETIME_LIMIT_HOURS. Each
    timed-out instance is terminated once, after deregistering the runners
    of this workflow run.

    Exits the process with status 1 if anything was terminated
    (presumably so the run is flagged -- confirm with the workflow).
    """
    # Grab all instances with a CI-generated tag
    aws_platform_lib = get_platform_lib(Platform.AWS)
    all_ci_instances = aws_platform_lib.find_all_ci_instances()
    select_ci_instances = aws_platform_lib.find_select_ci_instances()

    client = boto3.client('ec2')

    def with_launch_time(instances):
        # Pair each instance with its launch time, as find_timed_out_resources expects.
        return map(lambda x: (x, x['LaunchTime']), instances)

    timed_out = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS, current_time, with_launch_time(select_ci_instances))
    timed_out += find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS, current_time, with_launch_time(all_ci_instances))

    # An instance can appear in both lists. The instances are subscripted like
    # dicts, which are unhashable, so de-duplicate by equality instead of set().
    instances_to_terminate = []
    for inst in timed_out:
        if inst not in instances_to_terminate:
            instances_to_terminate.append(inst)

    print("Terminated Instances:")
    for inst in instances_to_terminate:
        deregister_runners(ci_env['PERSONAL_ACCESS_TOKEN'], f"aws-{ci_env['GITHUB_RUN_ID']}")
        client.terminate_instances(InstanceIds=[inst['InstanceId']])
        print(" " + inst['InstanceId'])

    if len(instances_to_terminate) > 0:
        sys.exit(1)

def cull_azure_resources(current_time: DateTime) -> None:
    """Terminate Azure CI VMs that have outlived their lifetime limit.

    VMs returned by find_select_ci_instances() are checked against
    FPGA_INSTANCE_LIFETIME_LIMIT_HOURS and those returned by
    find_all_ci_instances() against INSTANCE_LIFETIME_LIMIT_HOURS. Each
    timed-out VM is terminated once, after deregistering the runners of
    this workflow run.

    Exits the process with status 1 if anything was terminated
    (presumably so the run is flagged -- confirm with the workflow).
    """
    azure_platform_lib = get_platform_lib(Platform.AZURE)
    all_azure_ci_vms = azure_platform_lib.find_all_ci_instances()
    select_azure_ci_vms = azure_platform_lib.find_select_ci_instances()

    def with_launch_time(vms):
        # 'LaunchTime' is parsed from text here into a timezone-aware datetime.
        return map(lambda x: (x, datetime.datetime.strptime(x['LaunchTime'],'%Y-%m-%d %H:%M:%S.%f%z')), vms)

    timed_out = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS, current_time, with_launch_time(select_azure_ci_vms))
    timed_out += find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS, current_time, with_launch_time(all_azure_ci_vms))

    # A VM can appear in both lists. De-duplicate by equality instead of set(),
    # which would fail if the VM records are unhashable (e.g. dicts).
    vms_to_terminate = []
    for vm in timed_out:
        if vm not in vms_to_terminate:
            vms_to_terminate.append(vm)

    print("Terminated VMs:")
    for vm in vms_to_terminate:
        deregister_runners(ci_env['PERSONAL_ACCESS_TOKEN'], f"azure-{ci_env['GITHUB_RUN_ID']}")
        azure_platform_lib.terminate_azure_vms([vm]) #prints are handled in here

    if len(vms_to_terminate) > 0:
        sys.exit(1)

def main():
# Get a timezone-aware datetime instance
current_time = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
Expand Down
6 changes: 3 additions & 3 deletions .github/scripts/cull-old-ci-runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 3,14 @@
# Runs periodically in its own workflow in the CI/CD environment to tear down
# runners that are offline

from common import deregister_offline_runners
from github_common import deregister_offline_runners

# Reuse manager utilities
from ci_variables import ci_personal_api_token
from ci_variables import ci_env

def main():
    """Deregister every offline runner using the token stored in ci_env."""
    # deregister all offline runners
    deregister_offline_runners(ci_env['PERSONAL_ACCESS_TOKEN'])

if __name__ == "__main__":
main()
64 changes: 64 additions & 0 deletions .github/scripts/github_common.py
Original file line number Diff line number Diff line change
@@ -0,0 1,64 @@
import math
from fabric.api import *
import requests
from ci_variables import ci_env
import json

from typing import Dict, List, Any

# Github URL related constants
# All URLs below are derived from GITHUB_API_URL and GITHUB_REPOSITORY in ci_env,
# so they are fixed when this module is imported.
gh_repo_api_url = f"{ci_env['GITHUB_API_URL']}/repos/{ci_env['GITHUB_REPOSITORY']}"
# Issues endpoint; issue_post appends "/<number>/comments" to it.
gh_issues_api_url = f"{gh_repo_api_url}/issues"
gha_api_url = f"{gh_repo_api_url}/actions"
# Runners endpoint used by get_runners and delete_runner.
gha_runners_api_url = f"{gha_api_url}/runners"
gha_runs_api_url = f"{gha_api_url}/runs"
# Endpoint of the run identified by GITHUB_RUN_ID.
gha_workflow_api_url = f"{gha_runs_api_url}/{ci_env['GITHUB_RUN_ID']}"

def get_header(gh_token: str) -> Dict[str, str]:
    """Build the HTTP headers used for GitHub API requests.

    gh_token: API token; surrounding whitespace (e.g. a trailing newline) is stripped.
    Returns a dict with the Authorization and Accept headers.
    """
    return {"Authorization": f"token {gh_token.strip()}", "Accept": "application/vnd.github+json"}

def get_runners(gh_token: str) -> List:
    """Return every runner listed at gha_runners_api_url.

    A first request reads "total_count"; the runners are then fetched
    30 per page and concatenated.

    gh_token: API token passed to get_header.
    Raises Exception if any request returns a status other than 200.
    """
    r = requests.get(gha_runners_api_url, headers=get_header(gh_token))
    if r.status_code != 200:
        raise Exception(f"Unable to retrieve count of GitHub Actions Runners\nFull Response Below:\n{r}")
    runner_count = r.json()["total_count"]

    runners = []
    for page_idx in range(math.ceil(runner_count / 30)):
        # Pages are requested starting from 1, hence page_idx + 1.
        r = requests.get(gha_runners_api_url, params={"per_page" : 30, "page" : page_idx + 1}, headers=get_header(gh_token))
        if r.status_code != 200:
            raise Exception(f"Unable to retrieve (sub)list of GitHub Actions Runners\nFull Response Below\n{r}")
        runners.extend(r.json()["runners"])

    return runners

def delete_runner(gh_token: str, runner: Dict[str, Any]) -> bool:
    """Issue a DELETE for one runner and report whether it succeeded.

    runner: runner record; its "id" and "name" entries are read.
    Returns True when the response status is 204, otherwise prints a
    diagnostic and returns False.
    """
    runner_url = f"""{gha_runners_api_url}/{runner["id"]}"""
    r = requests.delete(runner_url, headers=get_header(gh_token))
    deleted = r.status_code == 204
    if not deleted:
        print(f"""Unable to delete runner {runner["name"]} with id: {runner["id"]}\nFull Response Below\n{r}""")
    return deleted

def deregister_offline_runners(gh_token: str) -> None:
    """Delete every runner whose "status" field equals "offline"."""
    for candidate in get_runners(gh_token):
        if candidate["status"] != "offline":
            continue
        delete_runner(gh_token, candidate)

def deregister_runners(gh_token: str, runner_name: str) -> None:
    """Delete every runner whose "name" contains runner_name as a substring."""
    for candidate in get_runners(gh_token):
        if runner_name not in candidate["name"]:
            continue
        delete_runner(gh_token, candidate)

def issue_post(gh_token: str, body: str) -> None:
    """Post body as a comment on the issue/PR named in the event payload.

    The issue id is the "number" field of the JSON file located at
    ci_env['GITHUB_EVENT_PATH'].

    Raises Exception if the POST does not return status 201.
    """
    with open(ci_env['GITHUB_EVENT_PATH']) as event_file:
        gh_issue_id = json.load(event_file)["number"]

    comments_url = f"{gh_issues_api_url}/{gh_issue_id}/comments"
    res = requests.post(comments_url, json={"body": body}, headers=get_header(gh_token))
    if res.status_code == 201:
        return
    raise Exception(f"HTTP POST error: {res} {res.json()}\nUnable to post GitHub PR comment.")
4 changes: 2 additions & 2 deletions .github/scripts/initialize-repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 4,15 @@

from common import manager_home_dir, manager_fsim_dir, manager_marshal_dir, set_fabric_firesim_pem
# This is expected to be launched from the CI container
from ci_variables import ci_workdir
from ci_variables import ci_env

def initialize_repo():
""" Initializes firesim repo: clones, runs build-setup, and intializes marshal submodules """

with cd(manager_home_dir):
run("rm -rf {}".format(manager_fsim_dir))
# copy ci version of the repo into the new globally accessible location
run("git clone {} {}".format(ci_workdir, manager_fsim_dir))
run("git clone {} {}".format(ci_env['GITHUB_WORKSPACE'], manager_fsim_dir))

with cd(manager_fsim_dir):
run("./build-setup.sh --skip-validate")
Expand Down
4 changes: 2 additions & 2 deletions .github/scripts/install-firesim-pem.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 3,7 @@
from fabric.api import *
import os

from ci_variables import ci_firesim_pem
from ci_variables import ci_env
from common import manager_home_dir, manager_fsim_pem, set_fabric_firesim_pem

def install_firesim_pem():
Expand All @@ -12,7 12,7 @@ def install_firesim_pem():
with cd(manager_home_dir):
# add firesim.pem
with open(manager_fsim_pem, "w") as pem_file:
pem_file.write(ci_firesim_pem)
pem_file.write(ci_env['FIRESIM_PEM'])
local("chmod 600 {}".format(manager_fsim_pem))

if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 273f6b2

Please sign in to comment.