Using IPFS in ML with `ipfspy.ipfsspec`

A tutorial on using IPFS in an ML workflow (with fastai)

In this tutorial, we will see how to use IPFS as the storage layer for datasets and models in an ML workflow.

from fastai.tabular.all import *
from ipfspy.ipfsspec.asyn import AsyncIPFSFileSystem
from fsspec import register_implementation
import asyncio
import io
import fsspec
import os

/usr/local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Adding a dataset to IPFS using a local node

# Register the async IPFS filesystem class with fsspec under its own protocol
# name, so that later fsspec.filesystem(...) lookups for that protocol
# resolve to this implementation.
register_implementation(
    name=AsyncIPFSFileSystem.protocol,
    cls=AsyncIPFSFileSystem,
)

class fs:
    """Namespace holding the two fsspec filesystems used in this tutorial.

    Both instances are created once, when the class body is executed:
    ``fs.ipfs`` for the "ipfs" protocol and ``fs.file`` for the "file"
    protocol.
    """

    # Created in the same order as the names on the left: ipfs first, then file.
    ipfs, file = (fsspec.filesystem(protocol) for protocol in ("ipfs", "file"))

Changed to local node

# Upload the local CSV to IPFS. The string shown as this cell's output is the
# identifier that the retrieval step later passes to fs.ipfs.get as `rpath`.
# NOTE(review): rpath='/test_dataset' is presumably the destination path on the
# IPFS node — confirm against the AsyncIPFSFileSystem.put implementation.
fs.ipfs.put(path='output/adult_data.csv', rpath='/test_dataset')

'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V'

Retrieving the dataset from IPFS using a public node

# Remove any local copy first, so the "After" listing shows that the file was
# re-created by the IPFS download rather than left over from an earlier step.
if fs.file.exists('output/adult_data.csv'):
    fs.file.rm('output/adult_data.csv', recursive=True)

# os.path.relpath turns each globbed path into one relative to the current
# working directory. str.lstrip(os.getcwd()) must not be used for this: it
# treats its argument as a *set of characters*, so it can also strip leading
# characters that belong to the relative path itself.
# NOTE(review): this "Before" listing globs 'test/data/dataset/output/*' while
# the "After" listing globs 'output/*' — confirm which directory is intended.
print('Before: ', [os.path.relpath(p) for p in fs.file.glob('test/data/dataset/output/*')])

# Download the dataset by its content identifier into the local output folder.
# NOTE(review): the exact effect of recursive/return_cid is defined by
# AsyncIPFSFileSystem.get — verify there.
fs.ipfs.get(rpath='QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V',
            lpath='output/adult_data.csv',  # a filename must be given
            recursive=True,
            return_cid=False)

print('After: ', [os.path.relpath(p) for p in fs.file.glob('output/*')])

Before:  []
After:  ['output/.ipynb_checkpoints', 'output/adult_data.csv', 'output/fol1', 'output/get', 'output/get_file', 'output/get_folder', 'output/test.txt', 'output/test2.txt', 'output/test3.txt']

Building a tabular model

# Load the CSV from the local output folder and strip leading/trailing
# whitespace from every column name in one pass.
df = pd.read_csv('output/adult_data.csv').rename(columns=str.strip)

# Column roles: categorical inputs, continuous inputs, and the target column.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]
cont_names = ['age', 'fnlwgt', 'education-num']
y_names = 'salary'
y_block = CategoryBlock()

# Preprocessing steps handed to TabularPandas.
procs = [Categorify, FillMissing, Normalize]

# Random split over the row indices of df, using RandomSplitter's defaults.
splits = RandomSplitter()(range_of(df))

to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=y_names,
    y_block=y_block,
    splits=splits,
)

# Wrap the processed table in dataloaders with a batch size of 64.
dls = to.dataloaders(bs=64)

# Tabular learner with layer sizes 200 and 100, reporting accuracy.
learn = tabular_learner(dls, layers=[200, 100], metrics=accuracy)

# Train for 3 epochs with a maximum learning rate of 1e-3.
learn.fit_one_cycle(n_epoch=3, lr_max=1e-3)

epoch	train_loss	valid_loss	accuracy	time
0	0.375601	0.359781	0.843366	00:03
1	0.359697	0.348975	0.842905	00:02
2	0.346300	0.346158	0.844134	00:02

# Save the trained learner to disk so the file can be uploaded to IPFS.
learn.export(fname='output/testmodel.pkl')

Adding model+config_files to IPFS

# Upload the exported learner file to IPFS. The string shown as this cell's
# output is the identifier to pass to fs.ipfs.get as `rpath` when retrieving
# the model again.
# NOTE(review): rpath='/test_model' is presumably the destination path on the
# IPFS node — confirm against the AsyncIPFSFileSystem.put implementation.
fs.ipfs.put(path='output/testmodel.pkl', rpath='/test_model')

'QmVoD2Bxm7hAZ9BGEg8DeSLstakhfUq1vZouVnwa1zMode'

Retrieving model+config_files from IPFS

# Remove any local copy first, so the "After" listing shows that the file was
# re-created by the IPFS download rather than left over from the export step.
if fs.file.exists('output/testmodel.pkl'):
    fs.file.rm('output/testmodel.pkl', recursive=True)

# os.path.relpath turns each globbed path into one relative to the current
# working directory. str.lstrip(os.getcwd()) must not be used for this: it
# treats its argument as a *set of characters*, so it can also strip leading
# characters that belong to the relative path itself.
print('Before: ', [os.path.relpath(p) for p in fs.file.glob('output/*')])

# Download the model by the identifier that the preceding fs.ipfs.put call for
# 'output/testmodel.pkl' printed. The previously hard-coded value here was a
# different identifier, so the round trip did not fetch the uploaded file.
# The identifier changes whenever the exported file changes; update it after
# re-running the upload.
# NOTE(review): the exact effect of recursive/return_cid is defined by
# AsyncIPFSFileSystem.get — verify there.
fs.ipfs.get(rpath='QmVoD2Bxm7hAZ9BGEg8DeSLstakhfUq1vZouVnwa1zMode',
            lpath='output/testmodel.pkl',  # a filename must be given
            recursive=True,
            return_cid=False)

print('After: ', [os.path.relpath(p) for p in fs.file.glob('output/*')])

Doing inference with retrieved model

# SECURITY NOTE: load_learner unpickles the file, and unpickling can execute
# arbitrary code. Only load model files retrieved from IPFS when their
# identifier comes from a source you trust.
learn = load_learner(fname='output/testmodel.pkl')

# Build a test dataloader from the first 10 rows of the dataset, reusing the
# preprocessing stored with the loaded learner.
dl = learn.dls.test_dl(df.head(10))

# Run inference on those rows.
learn.get_preds(dl=dl)

(tensor([[0.9134, 0.0866],
         [0.2455, 0.7545],
         [0.9745, 0.0255],
         [0.9120, 0.0880],
         [0.4120, 0.5880],
         [0.1225, 0.8775],
         [0.9719, 0.0281],
         [0.5242, 0.4758],
         [0.8494, 0.1506],
         [0.1704, 0.8296]]),
 tensor([[0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [1],
         [1]], dtype=torch.int8))