# Download expression data

import GEOparse
import pandas as pd
import numpy as np
import re
from pathlib import Path
import pylab as pl
import seaborn as sns
# Matplotlib defaults applied before the seaborn style is set.
pl.rcParams.update({
    'figure.figsize': (14, 10),
    'ytick.labelsize': 12,
    'xtick.labelsize': 11,
    'axes.labelsize': 23,
    'legend.fontsize': 20,
})
sns.set_style('ticks')
# First four colours of the seaborn "Set1" palette.
c1, c2, c3, c4 = sns.color_palette("Set1", 4)
# If GEOparse is not installed, run: !pip install GEOparse

# Output directories; each is created if it does not exist yet.
Dir_Expression = "1_Expression_data/"
Dir_WGCNA = "2_WGCNA_data/"
Dir_GRN = "3_GRN_data/"
for out_dir in (Dir_Expression, Dir_WGCNA, Dir_GRN):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Download data

URL = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE74nnn/GSE74488/suppl/GSE74488_sc_expression.csv.gz'
!curl {URL} -O GSE74488_sc_expression.csv.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0 6608k    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 52 6608k   52 3440k    0     0  2598k      0  0:00:02  0:00:01  0:00:01 2596k
100 6608k  100 6608k    0     0  3186k      0  0:00:02  0:00:02 --:--:-- 3186k
curl: (6) Could not resolve host: GSE74488_sc_expression.csv.gz
# Read the gzip-compressed expression table; the preview shows a "Locus"
# column followed by one column per cell.
expression_csv = "GSE74488_sc_expression.csv.gz"
df = pd.read_csv(expression_csv)
df.head()
Locus wolsc_kb2_4_1 wolsc_kb2_4_10 wolsc_kb2_4_11 wolsc_kb2_4_13 wolsc_kb2_4_14 wolsc_kb2_4_15 wolsc_kb2_4_18 wolsc_kb2_4_19 wolsc_kb2_4_22 ... wolsc_kb3_2_1 sc_0113_pa_19 sc_0113_pa_3 sc_0113_pa_44 sc_0113_pa_52 sc_0113_pa_58 sc_0113_pa_59 sc_0113_pa_60 sc_0113_pa_68 sc_0113_pa_83
0 AT1G01010 0.000000 7.702431 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.000000
1 AT1G01020 8.378906 0.000000 0.0 4.298833 0.0 0.0 0.0 0.0 0.0 ... 0.0 2.275709 0.0 0.0 0.0 0.0 3.614329 0.0 4.642478 3.406784
2 AT1G01030 0.000000 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.000000
3 AT1G01040 0.000000 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.000000
4 AT1G01046 0.000000 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.000000

5 rows × 239 columns

URL = "https://www.cell.com/cms/10.1016/j.cell.2016.04.046/attachment/ccb8f6e8-4822-4e06-9400-2eccfd98dd56/mmc4.xlsx"
!curl {URL} -O mmc4.xlsx
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 26662  100 26662    0     0  35313      0 --:--:-- --:--:-- --:--:-- 35267
curl: (6) Could not resolve host: mmc4.xlsx
# header=1: column names are taken from the second row of the sheet.
Metadata = pd.read_excel('mmc4.xlsx', header=1)
# Keep only the three columns used in the cells below.
metadata_columns = ["Cell", "Timepoint", "Identity"]
Metadata = Metadata[metadata_columns]
# Number of cells recorded per timepoint.
Metadata["Timepoint"].value_counts()
46hpc    86
3hpc     67
16hpc    55
Uncut    30
Name: Timepoint, dtype: int64

# Uncut

# Rows of the metadata whose Timepoint is "Uncut".
uncut_mask = Metadata["Timepoint"] == "Uncut"
Metadata_Uncut = Metadata[uncut_mask]
print(Metadata_Uncut.shape)
# Map each cell ID to "<cell ID>_<Identity>"; used below to relabel the
# expression columns.
Dic_uncut = {
    cell: cell + "_" + identity
    for cell, _timepoint, identity in Metadata_Uncut.values.tolist()
}
Metadata_Uncut.head()
(30, 3)
Cell Timepoint Identity
0 wolsc_kb2_4_10 Uncut Pericycle
1 wolsc_kb2_4_1 Uncut Vasculature
2 wolsc_kb2_4_18 Uncut Vasculature
3 wolsc_kb2_4_22 Uncut Vasculature
4 wolsc_kb2_4_26 Uncut Vasculature
# Expression table restricted to the Uncut cells, with the Locus column kept first.
uncut_cells = Metadata_Uncut["Cell"].tolist()
df_Uncut = df[["Locus"] + uncut_cells]
df_Uncut.to_csv(Dir_Expression + "Expr_Uncut.csv", index=False)

# WGCNA input layout: Gene_ID, Gene_name (a copy of Gene_ID), then one column
# per cell relabelled as "<cell ID>_<Identity>".
df_Uncut = df_Uncut.rename(columns={"Locus": "Gene_ID"})
df_Uncut = df_Uncut.assign(Gene_name=df_Uncut["Gene_ID"])
leading_cols = ["Gene_ID", "Gene_name"]
df_Uncut_cols = leading_cols + [col for col in df_Uncut.columns if col not in leading_cols]
df_Uncut = df_Uncut[df_Uncut_cols].rename(columns=Dic_uncut)
df_Uncut.to_csv(Dir_WGCNA + "WGCNA_input_Uncut.csv", index=False)
df_Uncut.head()
Gene_ID Gene_name wolsc_kb2_4_10_Pericycle wolsc_kb2_4_1_Vasculature wolsc_kb2_4_18_Vasculature wolsc_kb2_4_22_Vasculature wolsc_kb2_4_26_Vasculature wolsc_kb2_4_27_Vasculature wolsc_kb2_4_30_Vasculature wolsc_kb2_4_41_Vasculature ... wolsc_kb2_4_11_Unknown wolsc_kb2_4_13_Unknown wolsc_kb2_4_14_Unknown wolsc_kb2_4_15_Unknown wolsc_kb2_4_19_Unknown wolsc_kb2_4_24_Unknown wolsc_kb2_4_66_Unknown wolsc_kb2_4_76_Unknown wolsc_kb2_4_78_Unknown wolsc_kb2_4_80_Unknown
0 AT1G01010 AT1G01010 7.702431 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
1 AT1G01020 AT1G01020 0.000000 8.378906 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 4.298833 0.0 0.0 0.0 0.0 0.0 0.0 11.596565 0.0
2 AT1G01030 AT1G01030 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
3 AT1G01040 AT1G01040 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
4 AT1G01046 AT1G01046 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0

5 rows × 32 columns

# 3hpc

# Rows of the metadata whose Timepoint is "3hpc".
mask_3hpc = Metadata["Timepoint"] == "3hpc"
Metadata_3hpc = Metadata[mask_3hpc]
print(Metadata_3hpc.shape)
# Map each cell ID to "<cell ID>_<Identity>"; used below to relabel the
# expression columns.
Dic_3hpc = {
    cell: cell + "_" + identity
    for cell, _timepoint, identity in Metadata_3hpc.values.tolist()
}
Metadata_3hpc.head()
(67, 3)
Cell Timepoint Identity
30 sc_1228_pa_30 3hpc Epidermis\LRC
31 wolsc_kb2_3_13 3hpc Mixed distal
32 wolsc_kb2_3_14 3hpc Mixed distal
33 wolsc_kb2_3_2 3hpc Mixed distal
34 wolsc_kb2_3_27 3hpc Mixed distal
# Expression table restricted to the 3hpc cells, with the Locus column kept first.
cells_3hpc = Metadata_3hpc["Cell"].tolist()
df_3hpc = df[["Locus"] + cells_3hpc]
df_3hpc.to_csv(Dir_Expression + "Expr_3hpc.csv", index=False)

# WGCNA input layout: Gene_ID, Gene_name (a copy of Gene_ID), then one column
# per cell relabelled as "<cell ID>_<Identity>".
df_3hpc = df_3hpc.rename(columns={"Locus": "Gene_ID"})
df_3hpc = df_3hpc.assign(Gene_name=df_3hpc["Gene_ID"])
leading_cols = ["Gene_ID", "Gene_name"]
df_3hpc_cols = leading_cols + [col for col in df_3hpc.columns if col not in leading_cols]
df_3hpc = df_3hpc[df_3hpc_cols].rename(columns=Dic_3hpc)
df_3hpc.to_csv(Dir_WGCNA + "WGCNA_input_3hpc.csv", index=False)
df_3hpc.head()
Gene_ID Gene_name sc_1228_pa_30_Epidermis\LRC wolsc_kb2_3_13_Mixed distal wolsc_kb2_3_14_Mixed distal wolsc_kb2_3_2_Mixed distal wolsc_kb2_3_27_Mixed distal wolsc_kb2_3_51_Mixed distal sc_1228_pa_14_Mixed distal sc_1228_pa_86_Mixed distal ... sc_1228_pb_5_Unknown sc_1228_pb_70_Unknown sc_1228_pb_78_Unknown sc_1228_pb_86_Unknown sc_1228_pb_93_Unknown sc_1228_pa_36_Unknown sc_1228_pa_57_Unknown sc_1228_pa_77_Unknown sc_1228_pa_78_Unknown sc_1228_pa_85_Unknown
0 AT1G01010 AT1G01010 0.0 0.000000 0.000000 3.829904 0.0 0.000000 0.000000 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000
1 AT1G01020 AT1G01020 0.0 7.092747 5.949744 7.912041 0.0 6.881387 3.328156 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 2.212596
2 AT1G01030 AT1G01030 0.0 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000
3 AT1G01040 AT1G01040 0.0 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 3.963564 0.0 0.0 0.000000
4 AT1G01046 AT1G01046 0.0 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000

5 rows × 69 columns

END