I plotted categorical data by converting the classes into numbers (taking inspiration from another Stack Overflow question here). Is it possible to do something similar in Bokeh (something like this example)? The main advantage of seaborn (sns) is that it requires minimal processing of the DataFrame. Reading the example above, it would be rather problematic for me to label the axes if I wrapped it in a function. What I'm essentially aiming for is a function that creates a heatmap-like plot, taking the index on one axis and the columns on the other.
My current code:
import gzip, sys, getopt, shlex, subprocess, os
from pylatex import Document, Section, Subsection, Table, Math, TikZ, Axis, Plot, Figure, Package
from pylatex.utils import italic, escape_latex
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib.patches as mpatches
from pylatex import Document, Section, Figure, SubFigure, NoEscape
import os
import pandas as pd
import seaborn as sns
# Load the long-format table: one row per (sample, module) carrying a
# PASS/WARN/FAIL status.
df = pd.read_csv('postq.csv')

# Pivot to wide form: one row per sample, one column per module. Statuses that
# share the same (sample, tot.seq, module) key are joined with ', '.
df = (
    df.groupby(['sample', 'tot.seq', 'module'])['status']
    .apply(', '.join)
    .unstack()
    .reset_index()
    .rename_axis(None, axis=1)
)

# Numeric codes for the categorical statuses so they can be drawn as a heatmap.
mapping = {'PASS': 1, 'WARN': 2, 'FAIL': 3}

# Keep only the module columns, indexed by sample. `columns=` is used because
# recent pandas versions no longer accept `axis` as a positional argument.
df = df.drop(columns='tot.seq')
df = df.set_index('sample')

# Translate every module column through the mapping instead of listing each
# module by name. Values that are not keys of `mapping` become NaN, which the
# heatmap leaves blank.
df = df.apply(lambda col: col.map(mapping))

f, ax = plt.subplots()
# Pin the colour scale to the full range of codes so a given status always
# gets the same colour, even when some status is absent from this file.
hm = sns.heatmap(
    data=df,
    cmap="Pastel2",
    ax=ax,
    cbar=False,
    vmin=min(mapping.values()),
    vmax=max(mapping.values()),
)

# Shrink the heatmap horizontally to make room for a separate legend axes.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.7, box.height])
legend_ax = f.add_axes([.7, .5, 1, .1])
legend_ax.axis('off')

# Order the labels by their numeric code (PASS, WARN, FAIL) so that each label
# lines up with the colour sampled for that code; a plain alphabetical sort
# would pair the labels with the wrong patches.
legend_labels = sorted(mapping, key=mapping.get)
# Evenly spaced samples match the heatmap colours because the codes themselves
# are evenly spaced between vmin and vmax.
legend_colors = plt.cm.Pastel2(np.linspace(0, 1, len(mapping)))
patches = [mpatches.Patch(facecolor=c, edgecolor=c) for c in legend_colors]
legend = legend_ax.legend(
    patches,
    legend_labels,
    handlelength=0.8,
    loc='lower left',
)
for t in legend.get_texts():
    t.set_ha("left")

f.savefig('output.pdf', bbox_inches='tight')
And a snippet of the CSV file used as input:
"sample","module","status","tot.seq","seq.length","pct.gc","pct.dup"
"ERR435936_cleaned_1","Basic Statistics","PASS","19823396","62",51,51.06
"ERR435936_cleaned_1","Per base sequence quality","PASS","19823396","62",51,51.06
"ERR435936_cleaned_1","Per tile sequence quality","PASS","19823396","62",51,51.06
"ERR435936_cleaned_1","Per sequence quality scores","PASS","19823396","62",51,51.06
"ERR435936_cleaned_1","Per base sequence content","PASS","19823396","62",51,51.06
"ERR435936_cleaned_1","Per sequence GC content","WARN","19823396","62",51,51.06
"ERR435936_cleaned_1","Per base N content","PASS","19823396","62",51,51.06
"ERR435936_cleaned_1","Sequence Length Distribution","PASS","19823396","62",51,51.06
"ERR435936_cleaned_1","Sequence Duplication Levels","FAIL","19823396","62",51,51.06
"ERR435936_cleaned_1","Overrepresented sequences","WARN","19823396","62",51,51.06
"ERR435936_cleaned_1","Adapter Content","PASS","19823396","62",51,51.06
"ERR435936_cleaned_1","Kmer Content","FAIL","19823396","62",51,51.06
"ERR435936_cleaned_2","Basic Statistics","PASS","19823396","62",51,43.33
"ERR435936_cleaned_2","Per base sequence quality","PASS","19823396","62",51,43.33
"ERR435936_cleaned_2","Per tile sequence quality","PASS","19823396","62",51,43.33
"ERR435936_cleaned_2","Per sequence quality scores","PASS","19823396","62",51,43.33
"ERR435936_cleaned_2","Per base sequence content","PASS","19823396","62",51,43.33
"ERR435936_cleaned_2","Per sequence GC content","FAIL","19823396","62",51,43.33
"ERR435936_cleaned_2","Per base N content","PASS","19823396","62",51,43.33
"ERR435936_cleaned_2","Sequence Length Distribution","PASS","19823396","62",51,43.33
"ERR435936_cleaned_2","Sequence Duplication Levels","WARN","19823396","62",51,43.33
"ERR435936_cleaned_2","Overrepresented sequences","FAIL","19823396","62",51,43.33
"ERR435936_cleaned_2","Adapter Content","PASS","19823396","62",51,43.33
"ERR435936_cleaned_2","Kmer Content","FAIL","19823396","62",51,43.33
"ERR435937_cleaned_1","Basic Statistics","PASS","23937659","62",50,48.53
"ERR435937_cleaned_1","Per base sequence quality","PASS","23937659","62",50,48.53
"ERR435937_cleaned_1","Per tile sequence quality","PASS","23937659","62",50,48.53
"ERR435937_cleaned_1","Per sequence quality scores","PASS","23937659","62",50,48.53
"ERR435937_cleaned_1","Per base sequence content","PASS","23937659","62",50,48.53
"ERR435937_cleaned_1","Per sequence GC content","WARN","23937659","62",50,48.53
"ERR435937_cleaned_1","Per base N content","PASS","23937659","62",50,48.53
"ERR435937_cleaned_1","Sequence Length Distribution","PASS","23937659","62",50,48.53
"ERR435937_cleaned_1","Sequence Duplication Levels","WARN","23937659","62",50,48.53
"ERR435937_cleaned_1","Overrepresented sequences","WARN","23937659","62",50,48.53
"ERR435937_cleaned_1","Adapter Content","PASS","23937659","62",50,48.53
"ERR435937_cleaned_1","Kmer Content","FAIL","23937659","62",50,48.53
My current plot looks something like this (the legend order is slightly messed up):