Using IPFS in ML with ipfspy.ipfsspec

A tutorial on using IPFS in an ML workflow (with fastai)

In this tutorial, we will see how we can use IPFS as the storage layer for datasets and models in an ML workflow.

from fastai.tabular.all import *
from ipfspy.ipfsspec.asyn import AsyncIPFSFileSystem
from fsspec import register_implementation
import asyncio
import io
import fsspec
import os

Adding a dataset to IPFS using a local node

register_implementation(AsyncIPFSFileSystem.protocol, AsyncIPFSFileSystem)

class fs:
    # simple namespace bundling the IPFS filesystem and the local filesystem
    ipfs = fsspec.filesystem("ipfs")
    file = fsspec.filesystem("file")
Changed to local node
fs.ipfs.put(path='output/adult_data.csv', rpath='/test_dataset')
'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V'
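
put() returns the CID of the added content, and that CID is what later cells use to retrieve it. Instead of copy-pasting the string, it can be captured in a variable; a minimal sketch, assuming put() returns the CID as a plain string, as the output above suggests:

# Adding the same content again yields the same CID (IPFS is content-addressed),
# so the return value can simply be captured for reuse
dataset_cid = fs.ipfs.put(path='output/adult_data.csv', rpath='/test_dataset')
print(dataset_cid)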

Retrieving the dataset from IPFS using a public node

if fs.file.exists('output/adult_data.csv'):
    fs.file.rm('output/adult_data.csv', recursive=True)
    
print('Before: ', [os.path.relpath(p) for p in fs.file.glob('test/data/dataset/output/*')])

fs.ipfs.get(rpath='QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V', 
            lpath='output/adult_data.csv', # a filename must be given
            recursive=True, 
            return_cid=False)

print('After: ', [os.path.relpath(p) for p in fs.file.glob('output/*')])
Before:  []
After:  ['output/.ipynb_checkpoints', 'output/adult_data.csv', 'output/fol1', 'output/get', 'output/get_file', 'output/get_folder', 'output/test.txt', 'output/test2.txt', 'output/test3.txt']
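
Because AsyncIPFSFileSystem is registered under the ipfs protocol, the dataset can in principle also be read straight from IPFS without an explicit get(). A sketch, assuming pandas hands ipfs:// URLs to fsspec and the CID above resolves to the CSV file itself:

# Stream the CSV directly through the registered ipfs:// protocol
# (assumes the CID points at the CSV file and pandas dispatches the URL to fsspec)
df_remote = pd.read_csv('ipfs://QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V')
df_remote.head()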

Building a tabular model

df = pd.read_csv('output/adult_data.csv')
df.columns = [col.strip() for col in df.columns]
splits = RandomSplitter()(range_of(df))
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
y_names = 'salary'
y_block = CategoryBlock()
to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names,
                   y_names=y_names, y_block=y_block, splits=splits)

dls = to.dataloaders(bs=64)
learn = tabular_learner(dls, [200,100], metrics=accuracy)
learn.fit_one_cycle(3, 1e-3)
epoch  train_loss  valid_loss  accuracy  time
0      0.375601    0.359781    0.843366  00:03
1      0.359697    0.348975    0.842905  00:02
2      0.346300    0.346158    0.844134  00:02
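
Before exporting, the validation metrics can be recomputed on demand with the standard fastai call, which returns the validation loss followed by the metrics:

# Recompute validation loss and accuracy for the trained model
learn.validate()
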
learn.export('output/testmodel.pkl')

Adding the model and config files to IPFS

fs.ipfs.put(path='output/testmodel.pkl', rpath='/test_model')
'QmVoD2Bxm7hAZ9BGEg8DeSLstakhfUq1vZouVnwa1zMode'

Retrieving the model and config files from IPFS

if fs.file.exists('output/testmodel.pkl'):
    fs.file.rm('output/testmodel.pkl', recursive=True)
    
print('Before: ', [os.path.relpath(p) for p in fs.file.glob('output/*')])

fs.ipfs.get(rpath='QmVoD2Bxm7hAZ9BGEg8DeSLstakhfUq1vZouVnwa1zMode', 
            lpath='output/testmodel.pkl', # a filename must be given
            recursive=True, 
            return_cid=False)

print('After: ', [os.path.relpath(p) for p in fs.file.glob('output/*')])
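
Instead of downloading the file first, the exported Learner could also be loaded straight from IPFS. A sketch, assuming fs.ipfs.open() returns a readable file-like object and that load_learner accepts one (it delegates to torch.load):

# Load the exported Learner directly from IPFS, skipping get()
# (hypothetical alternative; depends on the assumptions stated above)
with fs.ipfs.open('QmVoD2Bxm7hAZ9BGEg8DeSLstakhfUq1vZouVnwa1zMode', 'rb') as f:
    learn_remote = load_learner(f)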

Doing inference with the retrieved model

learn = load_learner('output/testmodel.pkl')
dl = learn.dls.test_dl(df.iloc[:10])
learn.get_preds(dl=dl)
(tensor([[0.9134, 0.0866],
         [0.2455, 0.7545],
         [0.9745, 0.0255],
         [0.9120, 0.0880],
         [0.4120, 0.5880],
         [0.1225, 0.8775],
         [0.9719, 0.0281],
         [0.5242, 0.4758],
         [0.8494, 0.1506],
         [0.1704, 0.8296]]),
 tensor([[0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [1],
         [1]], dtype=torch.int8))
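
The first tensor holds the class probabilities and the second the encoded targets from the test dataloader. For a single row, learn.predict returns the processed row together with the predicted class and its probabilities (standard fastai tabular usage):

# Predict a single row; returns the processed row, the decoded class index,
# and the class probabilities
row, clas, probs = learn.predict(df.iloc[0])
clas, probs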