Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for CryoSPARC #1416

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions config.tpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ log_analytics:
#name:
#subscription_id: # Optional, if not specified the current subscription will be used

# Option to install the monitoring agent on static infra VMs. Can be disabled if the agent is installed by policy.
monitoring:
install_agent: true
# Option to install the monitoring agent on static infra VMs. Can be disabled if the agent is installed by policy.
monitoring:
install_agent: true

# If set to true, alert rules associated with az-hop will be created. Enabling alerting requires specifying an admin email address to send alerts to.
alerting:
enabled: true
Expand All @@ -46,7 +46,7 @@ anf:
homefs_service_level: Standard
# dual protocol
dual_protocol: false # true to enable SMB support. false by default
# If alerting is enabled, this value will be used to determine when to trigger alerts
# If alerting is enabled, this value will be used to determine when to trigger alerts
alert_threshold: 80 # alert when ANF volume reaches this threshold

# For small deployments you can use Azure Files instead of ANF for the home directory
Expand All @@ -64,7 +64,7 @@ mounts:
export: '{{anf_home_path}}' # Specify an existing NFS export directory, when using the ANF built in use '{{anf_home_path}}'
options: '{{anf_home_opts}}' # Specify the mount options. Default to rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev
# mount1:
# mountpoint: /mount1
# mountpoint: /mount1
# server: a.b.c.d # Specify an existing NFS server name or IP
# export: myexport1 # Specify an existing NFS export name
# options: my_options # Specify the mount options.
Expand All @@ -80,7 +80,7 @@ network:
vnet:
name: hpcvnet # Optional - default to hpcvnet
#id: # If a vnet id is set then no network will be created and the provided vnet will be used
address_space: "10.0.0.0/23"
address_space: "10.0.0.0/23"
# Special VNET Tags
# tags:
# key1: value1
Expand Down Expand Up @@ -143,7 +143,7 @@ network:
# asg-deployer: asg-deployer
# asg-guacamole: asg-guacamole
# asg-mariadb-client: asg-mariadb-client

# peering: # This list is optional, and can be used to create VNet Peerings in the same subscription.
# - vnet_name: #"VNET Name to Peer to"
# vnet_resource_group: #"Resource Group of the VNET to peer to"
Expand Down Expand Up @@ -413,7 +413,7 @@ queues:
ColocateNodes: false
# Specific idle time in seconds before shutting down VMs, make sure it's lower than autoscale.idle_timeout
idle_timeout: 300
# Set the max number of vm's in a VMSS; requires additional limit raise through support ticket for >100;
# Set the maximum number of VMs in a VMSS; values above 100 require a limit increase through a support ticket;
# 100 is default value; lower numbers will improve scaling for single node jobs or jobs with small number of nodes
MaxScaleSetSize: 100
- name: hc44rs
Expand Down Expand Up @@ -498,3 +498,17 @@ applications:
enabled: false
bc_vizer:
enabled: false
cryosparc:
enabled: false
license_id: <LICENSE-ID>
admin_user: adminuser
master_vm_size: Standard_D8s_v5
master_vm_image: azhpc:azhop-compute:centos-7_9:latest
master_hostname: cryosparc-master
master_data_disk_size: 256
master_data_disk_type: Premium_LRS
target_queues:
- nc24v3
- hb120v3
- hc44rs

60 changes: 34 additions & 26 deletions deploy/purebicep/azhop.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ var createDatabase = (config.queue_manager == 'slurm' && config.slurm.accounting

var lustreOssCount = deployLustre ? azhopConfig.lustre.oss_count : 0

var ossVmConfig = [for oss in range(0, lustreOssCount) : {
var ossVmConfig = [for oss in range(0, lustreOssCount) : {
key: 'lustre-oss-${oss}'
value: {
identity: {
Expand Down Expand Up @@ -393,14 +393,16 @@ var config = {
MariaDB: ['3306', '33060']
Guacamole: ['8080']
WinRM: ['5985', '5986']
// Applications: CryoSPARC
Applications: ['39000']
}

nsg_rules: {
default: {
//
// INBOUND RULES
//

// AD communication
AllowAdServerTcpIn : ['220', 'Inbound', 'Allow', 'Tcp', 'DomainControlerTcp', 'asg', 'asg-ad', 'asg', 'asg-ad-client']
AllowAdServerUdpIn : ['230', 'Inbound', 'Allow', 'Udp', 'DomainControlerUdp', 'asg', 'asg-ad', 'asg', 'asg-ad-client']
Expand All @@ -412,62 +414,65 @@ var config = {
AllowAdClientComputeUdpIn : ['290', 'Inbound', 'Allow', 'Udp', 'DomainControlerUdp', 'subnet', 'compute', 'asg', 'asg-ad']
AllowAdServerNetappTcpIn : ['300', 'Inbound', 'Allow', 'Tcp', 'DomainControlerTcp', 'subnet', 'netapp', 'asg', 'asg-ad']
AllowAdServerNetappUdpIn : ['310', 'Inbound', 'Allow', 'Udp', 'DomainControlerUdp', 'subnet', 'netapp', 'asg', 'asg-ad']

// SSH internal rules
AllowSshFromJumpboxIn : ['320', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-jumpbox', 'asg', 'asg-ssh']
AllowSshFromComputeIn : ['330', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'subnet', 'compute', 'asg', 'asg-ssh']
// Only in a deployer VM scenario
AllowSshFromDeployerIn : ['340', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-deployer', 'asg', 'asg-ssh']
AllowSshFromDeployerIn : ['340', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-deployer', 'asg', 'asg-ssh']
// Only in a deployer VM scenario
AllowDeployerToPackerSshIn : ['350', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-deployer', 'subnet', 'admin']
AllowSshToComputeIn : ['360', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-ssh', 'subnet', 'compute']
AllowSshComputeComputeIn : ['365', 'Inbound', 'Allow', 'Tcp', 'Ssh', 'subnet', 'compute', 'subnet', 'compute']

// PBS
AllowPbsIn : ['369', 'Inbound', 'Allow', '*', 'Pbs', 'asg', 'asg-pbs', 'asg', 'asg-pbs-client']
AllowPbsClientIn : ['370', 'Inbound', 'Allow', '*', 'Pbs', 'asg', 'asg-pbs-client', 'asg', 'asg-pbs']
AllowPbsComputeIn : ['380', 'Inbound', 'Allow', '*', 'Pbs', 'asg', 'asg-pbs', 'subnet', 'compute']
AllowComputePbsClientIn : ['390', 'Inbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'asg', 'asg-pbs-client']
AllowComputePbsIn : ['400', 'Inbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'asg', 'asg-pbs']
AllowComputeComputePbsIn : ['401', 'Inbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'subnet', 'compute']

// SLURM
AllowComputeSlurmIn : ['405', 'Inbound', 'Allow', '*', 'Slurmd', 'asg', 'asg-ondemand', 'subnet', 'compute']

// CycleCloud
AllowCycleWebIn : ['440', 'Inbound', 'Allow', 'Tcp', 'Web', 'asg', 'asg-ondemand', 'asg', 'asg-cyclecloud']
AllowCycleClientIn : ['450', 'Inbound', 'Allow', 'Tcp', 'CycleCloud', 'asg', 'asg-cyclecloud-client', 'asg', 'asg-cyclecloud']
AllowCycleClientComputeIn : ['460', 'Inbound', 'Allow', 'Tcp', 'CycleCloud', 'subnet', 'compute', 'asg', 'asg-cyclecloud']
AllowCycleServerIn : ['465', 'Inbound', 'Allow', 'Tcp', 'CycleCloud', 'asg', 'asg-cyclecloud', 'asg', 'asg-cyclecloud-client']

// OnDemand NoVNC
AllowComputeNoVncIn : ['470', 'Inbound', 'Allow', 'Tcp', 'NoVnc', 'subnet', 'compute', 'asg', 'asg-ondemand']
AllowNoVncComputeIn : ['480', 'Inbound', 'Allow', 'Tcp', 'NoVnc', 'asg', 'asg-ondemand', 'subnet', 'compute']

// Telegraf / Grafana
AllowTelegrafIn : ['490', 'Inbound', 'Allow', 'Tcp', 'Telegraf', 'asg', 'asg-telegraf', 'asg', 'asg-grafana']
AllowComputeTelegrafIn : ['500', 'Inbound', 'Allow', 'Tcp', 'Telegraf', 'subnet', 'compute', 'asg', 'asg-grafana']
AllowGrafanaIn : ['510', 'Inbound', 'Allow', 'Tcp', 'Grafana', 'asg', 'asg-ondemand', 'asg', 'asg-grafana']

// Admin and Deployment
AllowWinRMIn : ['520', 'Inbound', 'Allow', 'Tcp', 'WinRM', 'asg', 'asg-jumpbox', 'asg', 'asg-rdp']
AllowRdpIn : ['550', 'Inbound', 'Allow', 'Tcp', 'Rdp', 'asg', 'asg-jumpbox', 'asg', 'asg-rdp']
AllowWebDeployerIn : ['595', 'Inbound', 'Allow', 'Tcp', 'Web', 'asg', 'asg-deployer', 'asg', 'asg-ondemand']

// Guacamole
AllowGuacamoleRdpIn : ['610', 'Inbound', 'Allow', 'Tcp', 'Rdp', 'asg', 'asg-guacamole', 'subnet', 'compute']

// MariaDB
AllowMariaDBIn : ['700', 'Inbound', 'Allow', 'Tcp', 'MariaDB', 'asg', 'asg-mariadb-client', 'subnet', 'admin']

// Cluster applications
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would try to make this generic.

// The 4th field is the protocol: use the '*' wildcard like the other rules in this table.
// 'All' is the name of a port group (5th field, see DenyVnetInbound), not a protocol value.
AllowApplicationsIn : ['710', 'Inbound', 'Allow', '*', 'Applications', 'asg', 'asg-ondemand', 'subnet', 'compute']

// Deny all remaining traffic
DenyVnetInbound : ['3100', 'Inbound', 'Deny', '*', 'All', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork']


//
// Outbound
//

// AD communication
AllowAdClientTcpOut : ['200', 'Outbound', 'Allow', 'Tcp', 'DomainControlerTcp', 'asg', 'asg-ad-client', 'asg', 'asg-ad']
AllowAdClientUdpOut : ['210', 'Outbound', 'Allow', 'Udp', 'DomainControlerUdp', 'asg', 'asg-ad-client', 'asg', 'asg-ad']
Expand All @@ -479,57 +484,60 @@ var config = {
AllowAdServerComputeUdpOut : ['270', 'Outbound', 'Allow', 'Udp', 'DomainControlerUdp', 'asg', 'asg-ad', 'subnet', 'compute']
AllowAdServerNetappTcpOut : ['280', 'Outbound', 'Allow', 'Tcp', 'DomainControlerTcp', 'asg', 'asg-ad', 'subnet', 'netapp']
AllowAdServerNetappUdpOut : ['290', 'Outbound', 'Allow', 'Udp', 'DomainControlerUdp', 'asg', 'asg-ad', 'subnet', 'netapp']

// CycleCloud
AllowCycleServerOut : ['300', 'Outbound', 'Allow', 'Tcp', 'CycleCloud', 'asg', 'asg-cyclecloud', 'asg', 'asg-cyclecloud-client']
AllowCycleClientOut : ['310', 'Outbound', 'Allow', 'Tcp', 'CycleCloud', 'asg', 'asg-cyclecloud-client', 'asg', 'asg-cyclecloud']
AllowComputeCycleClientIn : ['320', 'Outbound', 'Allow', 'Tcp', 'CycleCloud', 'subnet', 'compute', 'asg', 'asg-cyclecloud']
AllowCycleWebOut : ['330', 'Outbound', 'Allow', 'Tcp', 'Web', 'asg', 'asg-ondemand', 'asg', 'asg-cyclecloud']

// PBS
AllowPbsOut : ['340', 'Outbound', 'Allow', '*', 'Pbs', 'asg', 'asg-pbs', 'asg', 'asg-pbs-client']
AllowPbsClientOut : ['350', 'Outbound', 'Allow', '*', 'Pbs', 'asg', 'asg-pbs-client', 'asg', 'asg-pbs']
AllowPbsComputeOut : ['360', 'Outbound', 'Allow', '*', 'Pbs', 'asg', 'asg-pbs', 'subnet', 'compute']
AllowPbsClientComputeOut : ['370', 'Outbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'asg', 'asg-pbs']
AllowComputePbsClientOut : ['380', 'Outbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'asg', 'asg-pbs-client']
AllowComputeComputePbsOut : ['381', 'Outbound', 'Allow', '*', 'Pbs', 'subnet', 'compute', 'subnet', 'compute']

// SLURM
AllowSlurmComputeOut : ['385', 'Outbound', 'Allow', '*', 'Slurmd', 'asg', 'asg-ondemand', 'subnet', 'compute']

// NFS
AllowNfsOut : ['440', 'Outbound', 'Allow', '*', 'Nfs', 'asg', 'asg-nfs-client', 'subnet', 'netapp']
AllowNfsComputeOut : ['450', 'Outbound', 'Allow', '*', 'Nfs', 'subnet', 'compute', 'subnet', 'netapp']

// Telegraf / Grafana
AllowTelegrafOut : ['460', 'Outbound', 'Allow', 'Tcp', 'Telegraf', 'asg', 'asg-telegraf', 'asg', 'asg-grafana']
AllowComputeTelegrafOut : ['470', 'Outbound', 'Allow', 'Tcp', 'Telegraf', 'subnet', 'compute', 'asg', 'asg-grafana']
AllowGrafanaOut : ['480', 'Outbound', 'Allow', 'Tcp', 'Grafana', 'asg', 'asg-ondemand', 'asg', 'asg-grafana']

// SSH internal rules
AllowSshFromJumpboxOut : ['490', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-jumpbox', 'asg', 'asg-ssh']
AllowSshComputeOut : ['500', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-ssh', 'subnet', 'compute']
AllowSshDeployerOut : ['510', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-deployer', 'asg', 'asg-ssh']
AllowSshDeployerPackerOut : ['520', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'asg', 'asg-deployer', 'subnet', 'admin']
AllowSshFromComputeOut : ['530', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'subnet', 'compute', 'asg', 'asg-ssh']
AllowSshComputeComputeOut : ['540', 'Outbound', 'Allow', 'Tcp', 'Ssh', 'subnet', 'compute', 'subnet', 'compute']

// OnDemand NoVNC
AllowComputeNoVncOut : ['550', 'Outbound', 'Allow', 'Tcp', 'NoVnc', 'subnet', 'compute', 'asg', 'asg-ondemand']
AllowNoVncComputeOut : ['560', 'Outbound', 'Allow', 'Tcp', 'NoVnc', 'asg', 'asg-ondemand', 'subnet', 'compute']

// Admin and Deployment
AllowRdpOut : ['570', 'Outbound', 'Allow', 'Tcp', 'Rdp', 'asg', 'asg-jumpbox', 'asg', 'asg-rdp']
AllowWinRMOut : ['580', 'Outbound', 'Allow', 'Tcp', 'WinRM', 'asg', 'asg-jumpbox', 'asg', 'asg-rdp']
AllowDnsOut : ['590', 'Outbound', 'Allow', '*', 'Dns', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork']
AllowWebDeployerOut : ['595', 'Outbound', 'Allow', 'Tcp', 'Web', 'asg', 'asg-deployer', 'asg', 'asg-ondemand']

// Guacamole
AllowGuacamoleRdpOut : ['610', 'Outbound', 'Allow', 'Tcp', 'Rdp', 'asg', 'asg-guacamole', 'subnet', 'compute']

// MariaDB
AllowMariaDBOut : ['700', 'Outbound', 'Allow', 'Tcp', 'MariaDB', 'asg', 'asg-mariadb-client', 'subnet', 'admin']


// Cluster applications
// The 4th field is the protocol: use the '*' wildcard like the other rules in this table.
// 'All' is the name of a port group (5th field, see DenyVnetOutbound), not a protocol value.
AllowApplicationsOut : ['710', 'Outbound', 'Allow', '*', 'Applications', 'asg', 'asg-ondemand', 'subnet', 'compute']

// Deny all remaining traffic and allow Internet access
AllowInternetOutBound : ['3000', 'Outbound', 'Allow', 'Tcp', 'All', 'tag', 'VirtualNetwork', 'tag', 'Internet']
DenyVnetOutbound : ['3100', 'Outbound', 'Deny', '*', 'All', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork']
Expand Down
7 changes: 4 additions & 3 deletions playbooks/cccluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@
include_vars:
file: '{{lookup_img_file}}'

- include_role:
- include_role:
name: cyclecloud_cluster
apply:
apply:
become: true
vars:
cc_region: '{{location}}'
Expand All @@ -108,7 +108,7 @@
cc_domain: '{{domain_name}}'
cc_queue_manager: '{{ queue_manager | default("openpbs") }}'
influxdb_database_name: "telegraf"
telegraf_influxdb_urls:
telegraf_influxdb_urls:
- "http://grafana:8086"
cc_slurm_version: '{{slurm.slurm_version | default("20.11.9")}}-1'
slurm_uid: 11100
Expand All @@ -119,6 +119,7 @@
enroot_scratch_dir: '/mnt/resource'
cvmfs_eessi_enabled: '{{cvmfs_eessi.enabled | default(false)}}'
cc_enable_remote_winviz: '{{enable_remote_winviz | default(false)}}'
cryosparc_enabled: '{{applications.cryosparc.enabled | default(false)}}'

# Generate the node array core lookup file for ondemand - will be only run if the marker file for ondemand exists
- import_tasks: nodearray_lookup.yml
Expand Down
8 changes: 7 additions & 1 deletion playbooks/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
line: 'AllowTcpForwarding yes'
- name: restart sshd
service:
name: sshd
name: sshd
state: restarted
- name: update packages for security
become: true
Expand Down Expand Up @@ -71,3 +71,9 @@
mode: '0755'
run_once : true

- name: Create {{homedir_mountpoint}}/apps directory
file:
path: '{{homedir_mountpoint}}/apps'
state: directory
mode: '0755'
run_once : true
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Desktop link entry pointing at the CryoSPARC web interface.
# NOTE(review): the host name and port in URL are hard-coded. They presumably have to
# match applications.cryosparc.master_hostname (default: cryosparc-master) and the
# CryoSPARC application port (39000) opened in the NSG rules - confirm when either changes.
[Desktop Entry]
Type=Link
Version=1.0
Name=CryoSPARC
Icon=/usr/share/icons/hicolor/16x16/apps/cryosparc.png
URL=http://cryosparc-master:39000/
Name[en_US.UTF-8]=CryoSPARC
Categories=Education
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"name": "PARTITION",
"worker_bin_path": "/anfhome/apps/cryosparc/cryosparc_worker/bin/cryosparcw",
"cache_path": "/mnt/resource",
"send_cmd_tpl": "{{ command }}",
"qsub_cmd_tpl": "sbatch {{ script_path_abs }}",
"qstat_cmd_tpl": "squeue -j {{ cluster_job_id }}",
"qdel_cmd_tpl": "scancel {{ cluster_job_id }}",
"qinfo_cmd_tpl": "sinfo",
"transfer_cmd_tpl": "scp {{ src_path }} loginnode:{{ dest_path }}"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Slurm batch script template used to submit CryoSPARC jobs.
# The double-brace placeholders are template variables; presumably CryoSPARC fills
# them in per job before calling sbatch - TODO confirm.
# PARTITION is a literal placeholder, presumably substituted with the target queue
# name when the cluster lane is registered - TODO confirm.
# Resource request: a single node, one task per requested CPU, one GPU resource per
# requested GPU, and memory expressed in MB as ram_gb * 1000.
# stdout and stderr are both written to the same job log file.
#SBATCH --partition=PARTITION
#SBATCH --nodes=1
#SBATCH --ntasks-per-node={{ num_cpu }}
#SBATCH --cpus-per-task=1
#SBATCH --threads-per-core=1
#SBATCH --gres=gpu:{{ num_gpu }}
#SBATCH --mem={{ (ram_gb*1000)|int }}MB
#SBATCH --job-name cryosparc_{{ project_uid }}_{{ job_uid }}
#SBATCH --output={{ job_log_path_abs }}
#SBATCH --error={{ job_log_path_abs }}

# The job command line itself is injected here by the template.
{{ run_cmd }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Prepare the CryoSPARC data disk: partition it, create an ext4 filesystem,
# register it in /etc/fstab by UUID and mount it on /cryosparc_data.
#
# The script is safe to re-run: the disk is only partitioned/formatted when the
# target partition does not carry a filesystem yet, the fstab entry is only
# appended once, and the mount point is created with -p.
set -e

# Run only on CryoSPARC master node
[ "$(hostname)" != "{{ applications.cryosparc.master_hostname }}" ] && exit 0

# NOTE(review): assumes the data disk is always enumerated as /dev/sdb - confirm
# for the selected VM size, device names are not guaranteed to be stable.
DATA_DISK=/dev/sdb
DATA_PART=${DATA_DISK}1
MOUNT_POINT=/cryosparc_data

# blkid fails when the partition is missing or holds no recognizable filesystem;
# only in that case is it safe to (re)create the partition table and format.
if ! blkid "${DATA_PART}" >/dev/null 2>&1; then
    # -s: script mode, never prompt for confirmation
    parted -s "${DATA_DISK}" mktable gpt
    parted -s "${DATA_DISK}" mkpart primary ext4 0% 100%
    # Give udev a chance to create the partition device node before formatting
    udevadm settle || true
    mkfs.ext4 "${DATA_PART}"
fi

DEV_UUID=$(blkid -s UUID -o value "${DATA_PART}")

# Append the fstab entry only once so repeated runs do not duplicate it
if ! grep -q "^UUID=${DEV_UUID} " /etc/fstab; then
    printf 'UUID=%s %s ext4 defaults 0 0\n' "${DEV_UUID}" "${MOUNT_POINT}" >> /etc/fstab
fi

mkdir -p "${MOUNT_POINT}"
mount -a
chown "{{ applications.cryosparc.admin_user }}:" "${MOUNT_POINT}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Download the CryoSPARC master and worker archives into the shared install
# directory, skipping archives that are already present.
#
# Downloads go to a temporary ".part" file and are renamed only after curl
# succeeds, so a failed or interrupted transfer (or an HTTP error page returned
# for an invalid license id) is never mistaken for a complete archive on the
# next run.
set -e

# Run only on CryoSPARC master node
[ "$(hostname)" != "{{ applications.cryosparc.master_hostname }}" ] && exit 0

# NOTE(review): /anfhome is hard-coded here while other parts of the deployment
# use the homedir_mountpoint variable - confirm they always resolve to the same path.
INSTALL_DIR=/anfhome/apps/cryosparc
SOURCES_DIR=${INSTALL_DIR}/sources

mkdir -p "${SOURCES_DIR}"
cd "${SOURCES_DIR}"

for COMPONENT in master worker; do
    ARCHIVE=cryosparc_${COMPONENT}.tar.gz
    if [ -s "${SOURCES_DIR}/${ARCHIVE}" ]; then
        echo "${ARCHIVE} already downloaded"
    else
        echo "Downloading ${ARCHIVE}"
        # -f: fail on HTTP errors instead of saving the error body as the archive
        curl -fL "https://get.cryosparc.com/download/${COMPONENT}-latest/{{ applications.cryosparc.license_id }}" -o "${ARCHIVE}.part"
        mv "${ARCHIVE}.part" "${ARCHIVE}"
    fi
done

chown -R "{{ applications.cryosparc.admin_user }}:" "${INSTALL_DIR}"
Loading