flsim/optimizers/async_aggregators.py (3 lines):
- line 75: TODO: add adaptive learning rate based on staleness of gradient
- line 226: # TODO: for optimizers that don't use momentum or adaptive learning rate,
- line 313: # TODO: better if this assert fires at config creation time

flsim/utils/distributed/fl_distributed.py (2 lines):
- line 249: # TODO: enable all_reduce on mixed dtypes with dtype-based bucketing
- line 325: # TODO: (jesikmin) T55869097 Check whether the size of buffer is same as

flsim/trainers/sync_trainer.py (2 lines):
- line 157: TODO correct note if above option added.
- line 162: ) # TODO do not call distributed utils here, this is upstream responsibility

flsim/reducers/base_round_reducer.py (2 lines):
- line 153: # TODO these are specific to mean reducer [this implementation]
- line 243: # TODO num_samples is used as the default weight, this needs revisit

flsim/clients/base_client.py (2 lines):
- line 247: # TODO use an independent random generator
- line 321: # TODO MM make sure metric reporter is multi-process safe.

flsim/trainers/async_trainer.py (1 line):
- line 287: # TODO: async_user_selector_type should be directly instantiable from json_config

flsim/reducers/secure_round_reducer.py (1 line):
- line 104: # TODO num_samples is used as the default weight, this needs revisit

flsim/utils/data/fake_data_utils.py (1 line):
- line 56: # TODO add flag for a final batch being incomplete

flsim/secure_aggregation/secure_aggregator.py (1 line):
- line 306: overflow_matrix = torch.div( # FIXME: div blows up when MAX_WIDTH_BYTES >7

flsim/utils/fl/common.py (1 line):
- line 170: TODO If needed we can also add device here.

flsim/interfaces/metrics_reporter.py (1 line):
- line 169: # TODO: is this needed? Do we ever call this externally?
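
The staleness TODO in async_aggregators.py (line 75) points at a known technique: scale the server-side learning rate down as a client update grows stale, i.e. as more global rounds complete between when the client read the model and when its update arrives. Below is a minimal sketch of the polynomial staleness weighting from Xie et al., "Asynchronous Federated Optimization"; it is an illustration of the technique, not FLSim's implementation, and `staleness_weight` / `exponent` are hypothetical names.

```python
def staleness_weight(staleness: int, exponent: float = 0.5) -> float:
    """Down-weight stale updates: weight = (1 + staleness) ** -exponent.

    staleness is the number of global rounds completed since the client
    pulled the model it trained against; a fresh update gets weight 1.0.
    """
    assert staleness >= 0, "staleness is measured in completed global rounds"
    return (1.0 + staleness) ** -exponent


# Example: an update trained against a model 4 rounds old is applied with
# roughly 0.45x the learning rate of a fresh update ((1 + 4) ** -0.5).
effective_lr = 0.1 * staleness_weight(staleness=4)
```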
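
The fl_distributed.py TODO (line 249) names its own fix: a single flattened all_reduce buffer cannot mix dtypes, so tensors must be bucketed by dtype before flattening. A minimal sketch of that idea follows, assuming `torch.distributed` is already initialized and all tensors live on one device; `all_reduce_mixed_dtypes` is a hypothetical helper, not FLSim's code.

```python
from collections import defaultdict
from typing import Dict, List

import torch
import torch.distributed as dist


def all_reduce_mixed_dtypes(tensors: List[torch.Tensor]) -> None:
    """All-reduce tensors in place, one flattened bucket per dtype."""
    buckets: Dict[torch.dtype, List[torch.Tensor]] = defaultdict(list)
    for t in tensors:
        buckets[t.dtype].append(t)
    for same_dtype in buckets.values():
        # Flatten a homogeneous bucket and reduce it in one collective call.
        flat = torch.cat([t.view(-1) for t in same_dtype])
        dist.all_reduce(flat)  # SUM across ranks by default
        # Scatter the reduced values back into the original tensors.
        offset = 0
        for t in same_dtype:
            numel = t.numel()
            t.copy_(flat[offset : offset + numel].view_as(t))
            offset += numel
```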