Download expression data
Contents
Download expression data#
import GEOparse
import pandas as pd
import numpy as np
import re
from pathlib import Path
import pylab as pl
import seaborn as sns
# Global matplotlib defaults for every figure drawn in this notebook.
pl.rcParams.update({
    'figure.figsize': (14, 10),
    'ytick.labelsize': 12,
    'xtick.labelsize': 11,
    'axes.labelsize': 23,
    'legend.fontsize': 20,
})
# Apply the seaborn style after the rcParams above, as in the original order.
sns.set_style('ticks')
# Four colours from the "Set1" palette, bound to short names.
c1, c2, c3, c4 = sns.color_palette("Set1", n_colors=4)
# !pip install GEOparse
# Output folders used by the notebook (trailing slash kept because the
# paths are later joined to file names by plain string concatenation).
Dir_Expression = "1_Expression_data/"
Dir_WGCNA = "2_WGCNA_data/"
Dir_GRN = "3_GRN_data/"
# Create each folder if it is missing; existing folders are left untouched.
for out_dir in (Dir_Expression, Dir_WGCNA, Dir_GRN):
    Path(out_dir).mkdir(parents=True, exist_ok=True)
Download data#
URL = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE74nnn/GSE74488/suppl/GSE74488_sc_expression.csv.gz'
!curl {URL} -O GSE74488_sc_expression.csv.gz
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 6608k 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
52 6608k 52 3440k 0 0 259
8k 0 0:00:02 0:00:01 0:00:01 2596k
100 6608k 100 6608k 0 0 3186k 0 0:00:02 0:00:02 --:--:-- 3186k
curl: (6) Could not resolve host: GSE74488_sc_expression.csv.gz
# Load the downloaded expression table; pandas reads the .gz file directly.
df = pd.read_csv("GSE74488_sc_expression.csv.gz")
# Preview the first five rows.
df.head(5)
Locus | wolsc_kb2_4_1 | wolsc_kb2_4_10 | wolsc_kb2_4_11 | wolsc_kb2_4_13 | wolsc_kb2_4_14 | wolsc_kb2_4_15 | wolsc_kb2_4_18 | wolsc_kb2_4_19 | wolsc_kb2_4_22 | ... | wolsc_kb3_2_1 | sc_0113_pa_19 | sc_0113_pa_3 | sc_0113_pa_44 | sc_0113_pa_52 | sc_0113_pa_58 | sc_0113_pa_59 | sc_0113_pa_60 | sc_0113_pa_68 | sc_0113_pa_83 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AT1G01010 | 0.000000 | 7.702431 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
1 | AT1G01020 | 8.378906 | 0.000000 | 0.0 | 4.298833 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 2.275709 | 0.0 | 0.0 | 0.0 | 0.0 | 3.614329 | 0.0 | 4.642478 | 3.406784 |
2 | AT1G01030 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
3 | AT1G01040 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
4 | AT1G01046 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
5 rows × 239 columns
URL = "https://www.cell.com/cms/10.1016/j.cell.2016.04.046/attachment/ccb8f6e8-4822-4e06-9400-2eccfd98dd56/mmc4.xlsx"
!curl {URL} -O mmc4.xlsx
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
100 26662 100 26662 0 0 35313 0 --:--:-- --:--:-- --:--:-- 35267
curl: (6) Could not resolve host: mmc4.xlsx
# Read the spreadsheet using its second row as the header (header=1) and
# keep only the three columns used in the rest of the notebook.
metadata_columns = ["Cell", "Timepoint", "Identity"]
Metadata = pd.read_excel('mmc4.xlsx', header=1)[metadata_columns]
# Number of cells per timepoint.
Metadata["Timepoint"].value_counts()
46hpc 86
3hpc 67
16hpc 55
Uncut 30
Name: Timepoint, dtype: int64
Uncut#
# Metadata rows whose Timepoint is "Uncut".
Metadata_Uncut = Metadata.loc[Metadata["Timepoint"] == "Uncut"]
print(Metadata_Uncut.shape)
# Rename map: cell ID -> "<cell ID>_<Identity>", used to relabel the
# expression columns with their annotated identity.
Dic_uncut = {
    cell: cell + "_" + identity
    for cell, identity in zip(Metadata_Uncut["Cell"], Metadata_Uncut["Identity"])
}
Metadata_Uncut.head()
(30, 3)
Cell | Timepoint | Identity | |
---|---|---|---|
0 | wolsc_kb2_4_10 | Uncut | Pericycle |
1 | wolsc_kb2_4_1 | Uncut | Vasculature |
2 | wolsc_kb2_4_18 | Uncut | Vasculature |
3 | wolsc_kb2_4_22 | Uncut | Vasculature |
4 | wolsc_kb2_4_26 | Uncut | Vasculature |
# Expression columns for the "Uncut" cells, keyed by locus, saved as-is.
df_Uncut = df[["Locus", *Metadata_Uncut["Cell"].tolist()]]
df_Uncut.to_csv(Dir_Expression + "Expr_Uncut.csv", index=False)

# Build the WGCNA input: Gene_ID and Gene_name come first (Gene_name is a
# copy of Gene_ID), followed by the cell columns.
df_Uncut = df_Uncut.rename(columns={"Locus": "Gene_ID"})
df_Uncut = df_Uncut.assign(Gene_name=df_Uncut["Gene_ID"])
leading_cols = ["Gene_ID", "Gene_name"]
df_Uncut_cols = leading_cols + [col for col in df_Uncut.columns if col not in leading_cols]
df_Uncut = df_Uncut[df_Uncut_cols]
# Relabel each cell column as "<cell ID>_<Identity>".
df_Uncut = df_Uncut.rename(columns=Dic_uncut)
df_Uncut.to_csv(Dir_WGCNA + "WGCNA_input_Uncut.csv", index=False)
df_Uncut.head()
Gene_ID | Gene_name | wolsc_kb2_4_10_Pericycle | wolsc_kb2_4_1_Vasculature | wolsc_kb2_4_18_Vasculature | wolsc_kb2_4_22_Vasculature | wolsc_kb2_4_26_Vasculature | wolsc_kb2_4_27_Vasculature | wolsc_kb2_4_30_Vasculature | wolsc_kb2_4_41_Vasculature | ... | wolsc_kb2_4_11_Unknown | wolsc_kb2_4_13_Unknown | wolsc_kb2_4_14_Unknown | wolsc_kb2_4_15_Unknown | wolsc_kb2_4_19_Unknown | wolsc_kb2_4_24_Unknown | wolsc_kb2_4_66_Unknown | wolsc_kb2_4_76_Unknown | wolsc_kb2_4_78_Unknown | wolsc_kb2_4_80_Unknown | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AT1G01010 | AT1G01010 | 7.702431 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
1 | AT1G01020 | AT1G01020 | 0.000000 | 8.378906 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 4.298833 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.596565 | 0.0 |
2 | AT1G01030 | AT1G01030 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
3 | AT1G01040 | AT1G01040 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
4 | AT1G01046 | AT1G01046 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
5 rows × 32 columns
3hpc#
# Metadata rows whose Timepoint is "3hpc".
Metadata_3hpc = Metadata.loc[Metadata["Timepoint"] == "3hpc"]
print(Metadata_3hpc.shape)
# Rename map: cell ID -> "<cell ID>_<Identity>", used to relabel the
# expression columns with their annotated identity.
Dic_3hpc = {
    cell: cell + "_" + identity
    for cell, identity in zip(Metadata_3hpc["Cell"], Metadata_3hpc["Identity"])
}
Metadata_3hpc.head()
(67, 3)
Cell | Timepoint | Identity | |
---|---|---|---|
30 | sc_1228_pa_30 | 3hpc | Epidermis\LRC |
31 | wolsc_kb2_3_13 | 3hpc | Mixed distal |
32 | wolsc_kb2_3_14 | 3hpc | Mixed distal |
33 | wolsc_kb2_3_2 | 3hpc | Mixed distal |
34 | wolsc_kb2_3_27 | 3hpc | Mixed distal |
# Expression columns for the "3hpc" cells, keyed by locus, saved as-is.
df_3hpc = df[["Locus", *Metadata_3hpc["Cell"].tolist()]]
df_3hpc.to_csv(Dir_Expression + "Expr_3hpc.csv", index=False)

# Build the WGCNA input: Gene_ID and Gene_name come first (Gene_name is a
# copy of Gene_ID), followed by the cell columns.
df_3hpc = df_3hpc.rename(columns={"Locus": "Gene_ID"})
df_3hpc = df_3hpc.assign(Gene_name=df_3hpc["Gene_ID"])
leading_cols = ["Gene_ID", "Gene_name"]
df_3hpc_cols = leading_cols + [col for col in df_3hpc.columns if col not in leading_cols]
df_3hpc = df_3hpc[df_3hpc_cols]
# Relabel each cell column as "<cell ID>_<Identity>".
df_3hpc = df_3hpc.rename(columns=Dic_3hpc)
df_3hpc.to_csv(Dir_WGCNA + "WGCNA_input_3hpc.csv", index=False)
df_3hpc.head()
Gene_ID | Gene_name | sc_1228_pa_30_Epidermis\LRC | wolsc_kb2_3_13_Mixed distal | wolsc_kb2_3_14_Mixed distal | wolsc_kb2_3_2_Mixed distal | wolsc_kb2_3_27_Mixed distal | wolsc_kb2_3_51_Mixed distal | sc_1228_pa_14_Mixed distal | sc_1228_pa_86_Mixed distal | ... | sc_1228_pb_5_Unknown | sc_1228_pb_70_Unknown | sc_1228_pb_78_Unknown | sc_1228_pb_86_Unknown | sc_1228_pb_93_Unknown | sc_1228_pa_36_Unknown | sc_1228_pa_57_Unknown | sc_1228_pa_77_Unknown | sc_1228_pa_78_Unknown | sc_1228_pa_85_Unknown | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AT1G01010 | AT1G01010 | 0.0 | 0.000000 | 0.000000 | 3.829904 | 0.0 | 0.000000 | 0.000000 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 |
1 | AT1G01020 | AT1G01020 | 0.0 | 7.092747 | 5.949744 | 7.912041 | 0.0 | 6.881387 | 3.328156 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 2.212596 |
2 | AT1G01030 | AT1G01030 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 |
3 | AT1G01040 | AT1G01040 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.963564 | 0.0 | 0.0 | 0.000000 |
4 | AT1G01046 | AT1G01046 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 |
5 rows × 69 columns
END