from fastai.tabular.all import *
from ipfspy.ipfshttpapi import IPFSApi
Using IPFS in ML with ipfspy.ipfshttpapi
A Tutorial using IPFS and ML (using fastai)
In this tutorial, we will see how we can use IPFS as storage for datasets and models in an ML workflow.
Add a dataset to IPFS using infura node
# Create an IPFS client and point it at the Infura gateway, then upload
# the training CSV. `add_items` returns (response, parsed-object); the
# trailing `obj` echoes the upload progress/result in the notebook.
api = IPFSApi()
# Output: Changed to local node
api.change_gateway_type = 'infura'
# Output: Changed to infura node
res, obj = api.add_items('output/adult_data.csv'); obj
[{'Name': 'adult_data.csv', 'Bytes': 262144},
{'Name': 'adult_data.csv', 'Bytes': 524288},
{'Name': 'adult_data.csv', 'Bytes': 786432},
{'Name': 'adult_data.csv', 'Bytes': 1048576},
{'Name': 'adult_data.csv', 'Bytes': 1310720},
{'Name': 'adult_data.csv', 'Bytes': 1572864},
{'Name': 'adult_data.csv', 'Bytes': 1835008},
{'Name': 'adult_data.csv', 'Bytes': 2097152},
{'Name': 'adult_data.csv', 'Bytes': 2359296},
{'Name': 'adult_data.csv', 'Bytes': 2621440},
{'Name': 'adult_data.csv', 'Bytes': 2883584},
{'Name': 'adult_data.csv', 'Bytes': 3145728},
{'Name': 'adult_data.csv', 'Bytes': 3407872},
{'Name': 'adult_data.csv', 'Bytes': 3670016},
{'Name': 'adult_data.csv', 'Bytes': 3932160},
{'Name': 'adult_data.csv', 'Bytes': 3974475},
{'Name': 'adult_data.csv',
'Hash': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V',
'Size': '3975476'}]
Retrieving a dataset from IPFS using public node
# Switch to a public gateway and fetch the dataset back by its CID.
api.change_gateway_type = 'public'
# Output: Changed to public node
res, obj = api.cat_items('QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V')
# Persist the raw bytes locally, then load them with pandas.
with open('output/adult_data_dl.csv', 'wb') as f:
    f.write(res.content)
df = pd.read_csv('output/adult_data_dl.csv')
# Column headers in the adult dataset carry stray whitespace — strip it.
df.columns = [col.strip() for col in df.columns]
Building a tabular model
# Build a fastai tabular learner on the downloaded adult dataset.
# Random train/valid split over all rows.
splits = RandomSplitter()(range_of(df))
# Categorical vs. continuous feature columns, preprocessing steps,
# and the dependent variable.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
y_names = 'salary'
y_block = CategoryBlock()
to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names,
                   y_names=y_names, y_block=y_block, splits=splits)
dls = to.dataloaders(bs=64)
# Two hidden layers of 200 and 100 units; train for 3 one-cycle epochs.
learn = tabular_learner(dls, [200, 100], metrics=accuracy)
learn.fit_one_cycle(3, 1e-3)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 0.376169 | 0.351168 | 0.838913 | 00:02 |
1 | 0.368451 | 0.349306 | 0.836456 | 00:02 |
2 | 0.357292 | 0.343430 | 0.840756 | 00:02 |
# Serialize the trained learner (model + transforms) to a pickle file.
learn.export('output/testmodel.pkl')
Adding model+config_files to IPFS
# Upload the exported model file to IPFS via the Infura gateway.
api.change_gateway_type = 'infura'
# Output: Changed to infura node
res, obj = api.add_items('output/testmodel.pkl'); obj
[{'Name': 'testmodel.pkl', 'Bytes': 242379},
{'Name': 'testmodel.pkl',
'Hash': 'QmR77qXp7CYEg6kHA3z77mcayTmm9hoXz7YQHFz9WjphiE',
'Size': '242393'}]
Retrieving model+config_files from IPFS
# Fetch the exported model back from IPFS by CID via a public gateway.
api.change_gateway_type = 'public'
# Output: Changed to public node
res, obj = api.cat_items('QmR77qXp7CYEg6kHA3z77mcayTmm9hoXz7YQHFz9WjphiE')
with open('output/testmodel_dl.pkl', 'wb') as f:
    f.write(res.content)
# Reload the learner and run inference on the first 10 rows.
learn = load_learner('output/testmodel_dl.pkl')
dl = learn.dls.test_dl(df.iloc[:10])
learn.get_preds(dl=dl)
(tensor([[0.9282, 0.0718],
[0.3556, 0.6444],
[0.9667, 0.0333],
[0.8731, 0.1269],
[0.4372, 0.5628],
[0.1317, 0.8683],
[0.9819, 0.0181],
[0.5074, 0.4926],
[0.8733, 0.1267],
[0.1803, 0.8197]]),
tensor([[0],
[0],
[0],
[0],
[0],
[0],
[0],
[1],
[1],
[1]], dtype=torch.int8))