Coverage for melissa/launcher/schema.py: 56%

1import rapidjson

2from typing import Dict, Any, Tuple

3import argparse

4from jsonschema import Draft4Validator, validators

5from jsonschema.exceptions import ValidationError

6import logging

7import sys

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# rapidjson parse-mode flags: accept comments and trailing commas in JSON.
# NOTE(review): presumably passed to rapidjson when the configuration file
# is loaded -- the call site is not in this file, confirm against the caller.
CONFIG_PARSE_MODE = rapidjson.PM_TRAILING_COMMAS | rapidjson.PM_COMMENTS

# JSON Schema (Draft 4) describing the Melissa configuration file.
#
# Besides the standard keywords, each entry carries a "message" string that
# print_options() shows as help text, and may carry a "default" that the
# extended validator built by _extend_validator() inserts into the config
# when the key is absent.
#
# Mandatory keys are declared with Draft 4 "required" arrays on the enclosing
# object. A boolean "required" inside a property subschema is Draft 3 syntax
# and is never enforced by Draft4Validator for non-object values.
CONF_SCHEMA = {
    'type': 'object',
    'required': ["server_filename", "server_class", "output_dir"],
    'properties': {
        "server_filename": {"type": "string", "message": "The name of the file containing the user defined server. Assumed to be in the same folder as the config."},
        "server_class": {"type": "string", "message": "The class name of the user defined server inside the server_filename file."},
        "output_dir": {"type": "string", "message": "The output dir to write results and logs. If relative path, then it is assumed relative to the CWD of the melissa-launcher command."},
        "study_options": {
            "type": "object",
            "message": "A custom dictionary which is accessible inside the server_class for users to parameterize their studies.",
            "properties": {
                "parameter_sweep_size": {"type": "integer", "message": "The number of clients to launch (or groups if using sobol indices)."},
                "num_samples": {"type": "integer", "default": 0, "message": "Number of samples expected to arrive from each client. When not given, it can be inferred by Melissa (DL server only)."},
                "verbosity": {"type": "integer", "default": 0, "message": "Set the logger verbosity. 3 includes all levels (including info, error, warning, and debug), 0 reduces to logging to minimum (error only)."}
            }
        },
        "dl_config": {
            "type": "object",
            "properties": {
                "simulation_timeout": {"type": "integer", "default": 400, "message": "Seconds of client inactivity between two messages before timing out the client."},
                "batch_size": {"type": "integer", "default": 10, "message": "Number of samples to build each batch."},
                "n_batches_update": {"type": "integer", "default": 10, "message": "Number of batches between validation checks and loss logging."},
                "buffer_size": {"type": "integer", "default": 10000, "message": "Maximum number of samples to store in the buffer (object used to generate batches for training)."},
                "per_server_watermark": {"type": "integer", "message": "Required number of samples in each server process buffer before batch creation and training can begin."},
                "tensorboard": {"type": "boolean", "default": True, "message": "Set to False to disable tensorboard logger entirely for production level runs where you do not wish to log metrics"},
                "get_buffer_statistics": {"type": "boolean", "default": False, "message": "Estimate buffer statistics each time a batch is generated and add to the tensorboard log. Requires custom server implementation of `get_buffer_statistics()`."},
            },
            "message": "A custom dictionary which is accessible inside the server_class for users to customize their training loops and buffers."},
        "sa_config": {
            "type": "object",
            "properties": {
                "mean": {"type": "boolean", "default": True, "message": "Collect mean for all fields."},
                "variance": {"type": "boolean", "default": False, "message": "Collect variance for all fields."},
                "skewness": {"type": "boolean", "default": False, "message": "Collect skewness for all fields."},
                "kurtosis": {"type": "boolean", "default": False, "message": "Collect kurtosis for all fields."},
                "checkpoint_interval": {"type": "integer", "default": 0, "message": "Checkpoint frequency for the sensitivity analysis. Number of samples between each checkpoint."},
                "sobol_indices": {"type": "boolean", "default": False, "message": "Activate sobol indices. Group count determined by study_options.parameter_sweep_size"},
            },
            "message": "A dictionary used to control the sensitivity analysis servers."
        },
        "server_config": {
            "type": "object",
            "default": {"preprocessing_commands": []},
            "properties": {
                "preprocessing_commands": {"type": "array", "default": [], "message": "Commands that will be preprocessed by bash prior to launching the server job."},
                "melissa_server_env": {"type": "string", "message": "Explicit path to the server installation. Typically does not need to be touched unless two different melissa installations are used."}
            },
            "message": "Special configuration for the server only.",
        },
        "client_config": {
            "type": "object",
            "properties": {
                "preprocessing_commands": {"type": "array", "default": [], "message": "Commands that will be preprocessed by bash prior to launching the client job."},
                "melissa_client_env": {"type": "string", "message": "Explicit path to find the client installation. Typically does not need to be touched unless two different melissa installations are used."}
            },
            "message": "Special configuration for the client only."},
        "launcher_config": {
            "type": "object",
            # Only enforced when a launcher_config section is present.
            "required": ["scheduler"],
            "properties": {
                "scheduler": {"type": "string", "message": "Select scheduler, can be 'oar', 'slurm', 'openmpi'"},
                "server_executable": {"type": "string", "default": "server.sh", "message": "Experienced users only, used to modify the bash template."},
                "bind": {"type": "string", "default": "0.0.0.0", "message": "Address to bind the REST API."},
                "http_port": {"type": "integer", "default": 8888, "message": "Port to put the REST API."},
                "http_token": {"type": "string", "default": "", "message": "Token used to access REST API, leave empty to let Melissa generate a unique secure token on launch."},
                "fault_tolerance": {"type": "boolean", "default": True, "message": "Activate/deactivate fault tolerance."},
                "protocol": {"type": "string", "default": "auto", "message": "Experienced users only, Melissa determines best protocol automatically."},
                "std_output": {"type": "boolean", "default": True, "message": "Keep or delete the std out/err files from all jobs."},
                "scheduler_arg": {"type": "array", "default": [], "message": "Common arguments to pass to scheduler for both client and server."},
                "scheduler_arg_client": {"type": "array", "default": [], "message": "Arguments to pass to scheduler for client only."},
                "scheduler_arg_server": {"type": "array", "default": [], "message": "Arguments to pass to scheduler for server only."},
                "scheduler_server_command": {"type": "string", "message": "Option to change the execution command (e.g. in place of srun or mpirun)"},
                "scheduler_client_command": {"type": "string", "message": "Option to change the execution command (e.g. in place of srun or mpirun)"},
                "scheduler_server_command_options": {"type": "array", "default": [], "message": "Options to pass to the scheduler inside the server execution command. Example: ['mpi=pmi2'] which, with slurm, would yield an sbatch.X.sh file with srun mpi=pmi2 <other arguments>."},
                "scheduler_client_command_options": {"type": "array", "default": [], "message": "Options to pass to the scheduler inside the client execution command. Example: ['mpi=pmi2'] which, with slurm, would yield an sbatch.X.sh file with srun mpi=pmi2 <other arguments>."},
                "scheduler_arg_container": {"type": "array", "default": [], "message": "Arguments to pass to containers (e.g. oar-hybrid)."},
                "container_client_size": {"type": "integer", "default": 1, "message": "Size of the container."},
                "job_limit": {"type": "integer", "default": 1000, "message": "Maximum number of active jobs allowed."},
                "besteffort_allocation_frequency": {"type": "integer", "default": 1, "message": "The frequency of job submission to submit to best-effort queue."},
                "timer_delay": {"type": "integer", "message": "The minimal delay between two job status updates with the same value."},
                "server_timeout": {"type": "integer", "message": "Maximum amount of seconds which defines a server timeout exit."},
                "load_from_checkpoint": {"type": "boolean", "default": False, "message": "Look for checkpoint files to start the server from."},
                "verbosity": {"type": "integer", "default": 0, "message": "Set the logger verbosity. 3 includes all levels (including info, error, warning, and debug), 0 reduces to logging to minimum (error only)."}
            },
        }
    }
}

100

def _extend_validator(validator_class):
    """
    Build a validator class that fills in schema defaults while validating.

    The returned class behaves like ``validator_class``, except that when a
    ``properties`` keyword is evaluated against an object instance, every
    property declaring a ``default`` that is absent from the instance is
    inserted before the regular ``properties`` validation runs. The
    validated instance is therefore modified in place.

    Args:
        validator_class: A jsonschema validator class, e.g. ``Draft4Validator``.

    Returns:
        A new validator class created with ``validators.extend``.
    """
    # Local import so this function does not rely on a file-level import.
    import copy

    validate_properties = validator_class.VALIDATORS['properties']

    def set_defaults(validator, properties, instance, schema):
        # Only objects can receive defaults. For any other instance type,
        # leave it alone so the "type" keyword reports the mismatch instead
        # of this hook raising AttributeError.
        if validator.is_type(instance, "object"):
            for prop, subschema in properties.items():
                if 'default' in subschema and prop not in instance:
                    # Deep-copy so mutable defaults (lists, dicts) are never
                    # shared between the schema and validated instances.
                    instance[prop] = copy.deepcopy(subschema['default'])

        yield from validate_properties(
            validator, properties, instance, schema,
        )

    return validators.extend(
        validator_class, {'properties': set_defaults}
    )

120

121

def validate_config(args: argparse.Namespace,
                    config: Dict[str, Any]) -> Tuple[argparse.Namespace,
                                                      Dict[str, Any]]:
    """
    Validate ``config`` against ``CONF_SCHEMA`` and fill in schema defaults.

    Validation uses the default-inserting validator derived from
    ``Draft4Validator``, so ``config`` may be modified in place. A validation
    failure is logged at critical level and is not re-raised.

    Args:
        args: Parsed command-line arguments; returned unchanged.
        config: Configuration dictionary to validate.

    Returns:
        The ``(args, config)`` pair that was passed in.
    """
    schema_validator = _extend_validator(Draft4Validator)(CONF_SCHEMA)
    try:
        schema_validator.validate(config)
    except ValidationError as err:
        logger.critical(
            f"Invalid configuration. Reason: {err}"
        )

    return args, config

135

136

class bcolors:
    """ANSI escape sequences used to decorate terminal output."""

    # SGR 32: green foreground (despite the attribute name).
    OKBLUE = '\033[32m'
    # SGR 91: bright red foreground (despite the attribute name).
    OKGREEN = '\033[91m'
    # SGR 4: underline.
    UNDERLINE = '\033[4m'
    # SGR 0: reset all attributes.
    ENDC = '\033[0m'

142

143

def print_options():
    """
    Print every configuration option described by ``CONF_SCHEMA``, then exit.

    Each top-level entry is printed with its help message and type. If the
    entry has nested ``properties``, each of them is printed underneath with
    its help message, default value ("N/A" when none is declared) and type.
    A missing ``message`` is rendered as an empty string at both levels.

    This function does not return: it ends by calling ``sys.exit()``.
    """
    print(f"{bcolors.UNDERLINE}Available config options{bcolors.ENDC}\n")
    for section_name, section in CONF_SCHEMA["properties"].items():
        section_type = section['type']
        section_message = section.get("message", "")
        print(f"{bcolors.OKBLUE}{section_name}{bcolors.ENDC}: {section_message} "
              f"Type {bcolors.UNDERLINE}{section_type}{bcolors.ENDC}.")

        # Scalar entries have no nested "properties"; the loop is then skipped.
        for option_name, option in section.get("properties", {}).items():
            option_message = option.get('message', "")
            option_type = option['type']
            default = option.get('default', "N/A")
            print(f" {bcolors.OKGREEN}{option_name}{bcolors.ENDC}: {option_message} "
                  f"Default value {default}, Type {bcolors.UNDERLINE}{option_type}{bcolors.ENDC}.")
    sys.exit()