How to import a CSV file and perform TSNE before plotting results

158 Views Asked by At

I have a CSV file (data matrix) in this format:

string, PC1, .......

There are ~50 principal components (PC1 ...) in all and several thousand rows.

The string simply denotes the 'group' or cluster the given record belongs to, and the PCs are the principal components.

I'd like to:

  1. Read this data in via Python.
  2. Perform TSNE over the PCs (excluding the group string of course) using 'from sklearn.manifold import TSNE'
  3. Plot the results such that the points are colored by their respective group (with MatPlotLib ideally).

I can reproduce these results using the example here. How can I do this for my own data set?

Here's what I have so far:

# Asker's attempt: load the CSV, run t-SNE on the PC columns, then plot the
# 2-D embedding colored by group.
import numpy as np

# NOTE(review): this file handle is never closed in the snippet.
f = open("data.csv")
f.readline()  # skip the header
# NOTE(review): no dtype is passed, so loadtxt presumably parses every column
# as float; the sample data's first column holds strings such as "Cluster_1"
# -- verify that this call really loads the file.
data =  np.loadtxt(fname = f, delimiter = ',')

X = data[:, 1:]  # select columns 1 through end
Y = data[:, 0]   # select column 0, Group

from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
X_2d = tsne.fit_transform(X)

# No errors so far, but not sure things were inputted right.
# Now onto the plot ... 

# I don't know how to get this to plot. Something like this? 
# I know my variables are not correct but I am not sure how to do 
# it.

# NOTE(review): `digits` is not defined anywhere in this snippet; it appears
# to be carried over from the referenced example.
target_ids = range(len(digits.target_names))

from matplotlib import pyplot as plt
plt.figure(figsize=(6, 5))
# NOTE(review): the line break after the trailing comma ends the tuple, so the
# 'purple' on the following line is a separate expression statement and is not
# part of `colors`.
colors = 'r', 'g', 'b', 'c', 'm', 'y', 'k', 'w', 'orange', 
'purple'
# NOTE(review): lowercase `y` is not defined in this snippet; the group column
# was stored as uppercase `Y` above.
for i, c, label in zip(target_ids, colors, digits.target_names):
    plt.scatter(X_2d[y == i, 0], X_2d[y == i, 1], c=c, 
label=label)
plt.legend()
plt.show()

data.csv

Here's a sample of the data matrix ==> Data

Cluster,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
Cluster_1,-4.19074866305679,-2.3114790167409,9.39927280204664,-0.593642072861712,1.17176193948157,-0.742145836155685,4.43802591864486,-0.652446722813818,-0.920947477846146,2.21400321753458
Cluster_1,1.98257946728062,-0.278838997160163,3.64679494860449,-3.20365393464409,-6.16873016218339,2.23385431309391,2.07383595815928,-0.293442303384028,0.380610812363262,1.039360444384
Cluster_1,-1.85384258784488,-0.900414283696723,5.99139541488598,-0.554038293773004,-1.71151477060067,1.47538870670287,0.483527275901269,-1.62025319461863,-0.813362897457808,-5.3318412835307
Cluster_1,-3.9447682972407,2.50114991002603,7.93530000360361,1.03651790424733,3.85068283425446,0.644722114378904,-2.78048320822965,-0.410997034150104,-0.77854588286045,-3.62269572641588
Cluster_1,-0.342459300820741,3.89931160055743,9.83488486014281,0.684417499888137,-2.24477516001519,-0.53650782036905,3.6239827125378,2.24948903160622,-10.1465863144131,1.8076501318821
Cluster_1,-6.80167059960843,3.84810142631941,3.74935719217606,0.643109860354946,-1.51312267724416,-1.94600244477288,1.2474070502925,-1.30455830012494,-5.26075644468245,1.19576840045512
Cluster_1,-5.21396892049378,1.37009411000211,4.22323967278901,-0.837171508432166,3.6152127742526,-2.26887906668213,-1.51127917313272,-2.58795948987888,-1.36237969989142,0.463118513963812
Cluster_1,-6.45668038630165,0.482789577218896,4.15764793813229,-0.856977296613979,3.92517640194495,-2.14102835556348,-1.32628670685919,-2.74598008086183,-1.2212502492269,0.862236861572023
Cluster_1,-3.80889235737749,0.32412216306602,6.11759883885916,1.41391673327865,2.19441239582403,-1.05174901575836,1.79209113939139,0.189262431381358,-1.16465711604668,0.120123725798547
Cluster_1,-6.17688129997385,-1.19952012807139,4.98986161953552,-1.24019448134547,2.23101350100076,1.43739231407412,-1.0078786177793,-1.8118048146765,-0.658666514716288,-3.32655456991394
Cluster_1,3.94256605628196,0.771492324322537,5.18601341376339,-0.581305552510033,-3.52685215268308,0.047297992398,0.045907291911,-1.27595257754982,-2.61418653290578,5.98750645258197
Cluster_1,-10.8726884490941,8.64137736454019,-0.687352582869544,-1.68157404014411,-0.385657532527084,2.77690262201275,0.285921970406374,-4.14280875714474,2.68258147917207,-0.012904845632
Cluster_1,1.19895775068216,11.6714121865598,2.55875975818913,-3.48704963723714,-6.95121572024931,0.332941825897971,2.86661069107883,-4.26582841952989,-6.35917828195582,1.95608437634731
Cluster_1,-0.18793444333629,-3.16007405708283,6.15443547770981,-2.03279852935587,-4.46296836767397,0.901906618410912,0.214743157742514,-4.00137560033691,0.970752471841541,2.75302220394964
Cluster_1,-1.61212714374558,6.70096800889067,10.9387530920283,-1.98704177135192,-1.0520318258725,2.09479999952944,2.5868146448387,1.37631489510221,-4.27036676873051,3.44911043363636
Cluster_1,-8.26554379005489,10.191074484372,-0.193342174605889,2.24385399001903,0.666977756445278,-1.7838481060193,0.143932161949605,0.256936978137158,5.07807982022143,1.38379258185045
Cluster_2,-3.22722479625242,1.91794832679299,-12.1449273744533,-6.03819107652502,-3.11281229838759,-1.06024511750338,-4.04337007064818,-3.55817487749048,-2.98922342297734,1.56596624717256
Cluster_2,-4.96377361568073,4.64616309705601,-4.75711585016047,-5.5843914853725,1.06072779211949,2.36770211172687,-1.45870072088272,-3.81637287809334,-0.758704928767302,-0.878329910298221
Cluster_2,-0.442664046400845,-11.1761256059351,0.965553541395181,-4.04575093788783,4.71227865340665,-0.28424324617405,-1.2998039698697,2.59434507669181,-0.142259898744206,0.782833506870817
Cluster_2,-3.26575591145758,3.52637932555573,-9.82416507896256,-1.1056642919943,-2.552246029301,2.46263674385783,-2.08943743528241,-3.97066033078793,-2.37161121502816,0.918423340571944
Cluster_2,-5.19773139588515,-7.4586201814046,-2.09779570264301,-2.76968495918963,4.2818507236622,-1.97848208528396,-2.8504951172741,1.9215801013077,-1.36999464183777,0.688276367477153
Cluster_2,-1.51240839733992,-2.0659608927962,-5.44126020887604,-4.13472780158545,-0.752166756429524,-1.03228164457759,-1.88655084230675,1.93398554656474,0.072874310015,0.732333801518322
Cluster_3,-9.37879542217098,5.79438203635879,-4.08215063980818,-0.161427408682977,0.708354970355071,4.08056233302452,1.68841528056347,-0.77454383854525,3.06476225908085,-0.543884959044934
Cluster_4,1.76357586377601,9.8644419006187,-1.23330034894043,3.62001774290015,-6.99720098249551,-12.9849197171815,0.384845100669861,-2.16969573251872,-2.34424627798988,-4.37443998423859
Cluster_4,-3.59258087710534,10.4202772567947,-2.03178626493388,3.36894410629419,-6.21808969444302,-10.3960115936334,0.744683342387199,-2.71619871314395,-0.864319687866104,-3.88433786772862
Cluster_4,-2.77137676463244,7.11156086639301,-3.79346525735018,3.98854610550484,-2.74266937716748,-2.69895252925707,1.42580921520308,0.52341237760177,-0.653073692263479,-4.17837867324076
Cluster_5,-7.20843845239323,4.88354093174613,-8.25416845174307,1.80819357538186,0.962829692285245,-1.62723424869167,-2.99562260678368,0.666329526507536,0.790501236200662,3.13922484983811
Cluster_5,-8.77999049732949,11.3746625338863,-6.8051921297685,0.286319241261655,-0.046085121267,-3.29586449891124,-1.81296455760835,1.08806117959378,-0.292823980511707,0.864695307133798
Cluster_5,-6.54991900000631,3.36846398949723,-9.09925966428109,3.49207923525609,-0.753413590124969,-3.44279964345711,-2.82870586907551,0.609577442955855,0.599052608112118,3.82840035520411
Cluster_5,-7.3268088907519,5.11458676125091,-8.7506448965699,2.40325840235591,-1.36392218751903,-5.4327583832764,-2.31711264893077,-0.510174500659689,1.76983549409799,2.88476178937443
Cluster_5,-5.46141590197516,2.0140081061589,4.10151129440287,2.83820079747994,4.03718244699356,0.442150118887099,-2.60223701341031,0.393915109565642,2.67853799939027,0.082167162056
Cluster_5,-2.43412513828487,9.95999416974672,-1.67012491378917,3.585888191714,-1.84704231642318,1.08212528811407,-4.1505062634571,3.02362690426123,0.689355284068352,-1.46576217598739
Cluster_6,-9.25788624654966,-0.717917440640267,-2.66446959661923,-2.39158428597075,-2.72110417023728,1.91861841609852,-2.8655375215658,-0.353833906087788,-0.583208899573201,-3.31496993204513
Cluster_6,-7.27404957292116,-0.659371591160479,-3.18597399768597,-3.13180384735567,0.626785604992769,2.09888672736825,-1.82014579499474,-0.660504190234661,1.39803405614371,-1.73512253867803
Cluster_6,-8.80325321294874,-0.538523555443754,-2.31289892469245,-1.86295781312349,-0.631461764223445,3.00495571456239,-3.02882939365397,0.368737508727169,-2.07988208798835,-2.77946921723324
Cluster_7,-10.0637122775403,11.1365054712427,1.78290785415837,1.60756545833496,1.5627952575943,1.11225809236298,1.23565189005447,-0.6708079180048,3.24468006354534,0.126386364354851
Cluster_7,-6.20156729625569,7.08924238610286,-0.253183760045115,2.88208048146755,-0.111391216756008,2.65713332130236,1.29819021178623,0.884225144000369,2.95679253119527,-0.265965725238587
Cluster_7,-1.24471615805629,9.4517509729273,-1.41004721068876,2.25809623849838,2.92215904399965,3.04411118112857,2.6463149821752,1.23756451797625,2.47889647701158,-1.10612017877547
Cluster_7,-3.94604685792285,8.19437052733983,-0.364323686934766,-0.989538114346931,-0.168248475990964,5.481414049687,2.3030869343218,-1.61805075712739,0.081426309599,-0.818918577909746
Cluster_7,-8.19316875803613,4.37377265193872,1.87196601084966,-2.20852725042847,0.595722716621586,4.19423305398032,-0.218530747532872,-4.57427411578954,2.78603384073742,1.16832861052443
Cluster_8,-2.31264351166882,-7.04584865681652,-5.80829925156119,0.652171811447077,-1.42298653546138,5.14245364391508,-3.60136451672907,4.8235872664965,-0.748067901510591,0.579252892100029
Cluster_8,-4.02175955998037,-10.5369974967429,-2.66217099906338,-2.07104969480524,1.19171263418716,2.60549575036903,-3.4232934078825,4.58787019039288,-1.22969573176572,0.417092790073341
Cluster_8,-0.450729641777152,-8.06097592851923,-1.93179576538404,-1.54316693126295,0.388218836193878,3.38170489712624,-2.71530155011698,4.31724570572477,-1.35144375920143,0.063740536349
Cluster_8,-1.92332005408079,5.19734607988687,-0.312395871457188,0.572522376300537,-1.1333006427967,4.31509650792092,-3.5652380290365,-1.84874534768595,0.36236195718064,-0.136695641648831
Cluster_8,-2.87162160324038,1.89940370041004,2.91840872156807,1.1275985084963,-2.93874880794748,3.31814482083075,2.24831308156032,-1.06629800604837,2.74742408298916,-1.05034002335024
Cluster_8,-1.70340903801383,5.21477341787199,1.64106521238798,2.35642383698622,-0.12158170169125,3.25047706393128,-4.97005593660535,-1.82759118270173,0.231919237249706,-0.84751100955402
Cluster_8,1.33997855336802,-5.92669400538312,-5.97829487344513,2.23688015975616,-0.863455132847162,5.17545799327032,-1.45577423852731,4.76723343881852,-1.8445248909068,-0.400732237415616
Cluster_8,-4.96831285051317,-2.50950031682062,-3.51089432808703,-2.86580357998906,6.41675178023052,3.61747895380617,-0.766572062185916,1.06298614325571,1.07873098947964,0.171707042205157
Cluster_8,-7.28187820533421,-2.9297859808995,-10.1158776354913,0.416066382560798,-1.01668573330195,2.32817367965705,-1.0252824307583,3.61210131881915,-0.429069125853988,-0.92968738097663
Cluster_8,11.6017472755769,-6.86558966517621,-0.66626026265831,1.61071741684475,-4.11900733858489,4.23598096472145,-4.29457565883984,1.3004818011327,-4.39198489152848,-0.430078453359456
Cluster_9,-15.4727117790432,4.40625531762835,-3.24012853873679,1.08057877194935,-2.40695500565692,-0.442045908204792,2.15988854238068,-5.48417340194856,3.69032715175745,0.113485601252325
Cluster_10,-0.920283294805583,-6.20814427214578,-7.13574814688889,12.1230250144517,-1.48508563055724,2.35854151737953,1.39270374583192,-1.56891180596862,-0.763064968865743,1.81645346104486
Cluster_10,-2.39091777710151,-7.72012717441284,-5.8127307830636,12.3294041180726,-0.096161790087,2.31654801910331,0.188299815468409,-2.49322833135978,-1.09411067291458,3.44416617458784
Cluster_10,-2.43018012395959,-7.81686095976607,-6.62612720299291,12.2213477725109,0.701858633150414,3.95203446434868,-0.178763096659237,-1.81028308577515,-1.11243369939956,3.41526273274297
Cluster_10,1.89784723829513,-6.58615668062996,-4.1361838259426,10.6032768022298,-1.88500418334025,2.87597317060192,-2.24070144611243,-3.94454658667753,-1.97710094306405,3.96200646383223
Cluster_10,2.01402139375662,-8.67580096870861,-5.07528997977367,15.1016749299745,-0.592234777981882,7.55743325361771,-1.16950033839491,-1.275404577075,-1.99316124096284,3.38460040476807
Cluster_10,-1.76980618597936,-8.66836620743037,-2.96955219076149,10.6190461488662,1.02340818905191,5.72921648911787,1.32191907702445,-1.59027193124414,-0.717041533590361,3.27784819648231
Cluster_10,0.184682969691294,-6.79109333296552,-4.92347393742612,13.2698145354908,-0.377953567875509,2.88902871928706,0.051150223291,-2.79518240321823,-0.787717416260652,2.89964555230566
Cluster_10,-2.08040374399044,-8.54335448536108,-6.10936123721588,12.285281320797,0.624842694702065,3.09507434143223,-0.158645979656603,-2.26148955767154,-1.24630505314498,3.78734413859284
Cluster_10,-1.05316103528053,-5.34178516213449,-4.98560267547929,11.4203555821621,0.757657640551918,3.75022757390374,-0.724445746007144,-1.87933874678006,-0.573861360255399,2.77027025061986
Cluster_10,0.764117161556056,-4.65407563059615,-5.46006249183363,11.7259588090155,1.87026234175691,3.90729833327399,2.39412337452117,-1.25984492854783,-2.69753247803829,2.75202418524268
Cluster_10,25.6189385885457,-7.303493764982,-6.11961128422636,10.0514487035403,3.12712782803259,2.54369066195579,6.03989036207753,-2.19770635596702,-1.17895756061134,-2.00490242089319
Cluster_10,23.6896597436665,-7.12492411580007,-7.2224545056319,9.26835864430006,4.91588714600237,3.12039413319027,6.65301695786195,-1.79192580742878,-1.59031168080547,-1.17474485956183
Cluster_10,2.61786172474484,-6.14896481556833,-2.51971256396999,13.3711272494905,-1.40954621071644,-0.299786999687809,-2.42100480703838,-2.21571123120266,0.267739757574326,4.74902835634352
Cluster_10,3.99841842924889,-2.33990157674365,-4.80685564190575,11.2074750108296,3.03486620516681,5.1979453142878,0.39967816534143,-2.28295858455617,-1.08554155536366,1.25136387739084
Cluster_10,4.11925688926564,-6.11118925343565,-2.78156156539564,13.6399873031493,-2.48725935686101,3.75900617675867,-1.93031093973529,-1.00062093896315,-2.09201409151273,1.22014085212784
Cluster_10,1.89719843522385,-9.69947257318146,-1.55039668453206,9.29327914691624,1.19779782151667,3.91336672385837,0.17137163219697,-1.60097945473396,-0.758753849581679,1.92784932626501
Cluster_10,0.5627489340048,-6.98800042143365,-3.54023619538625,13.0049530781971,-0.488178276647255,4.45509667306942,-1.37432533902106,-2.2295006699617,-0.873807644540273,2.24421544552248
Cluster_10,-3.21534691916798,-6.58958369444703,-5.40918898530161,11.0792564351361,0.784194502022903,3.85546103331184,-0.555964130156387,-2.00495457566714,-0.845193524089086,2.89873193865419
Cluster_10,2.76047501821989,-6.69061368574042,-3.70377418467318,14.1769104874953,-2.27850206742421,1.98585006670746,-0.330510502652474,-3.13552592280856,-0.234129245170327,2.12309212700537
Cluster_10,-1.93058167467423,-7.78332611686089,-4.26131495973219,12.62006389593,-0.929215344895555,2.9769426286792,-0.948345124089805,-2.91492345396226,-0.667066795857259,3.07463415626867
Cluster_10,-4.69038703702806,-6.01706039885766,-3.52695659277151,6.3029877038907,4.30338216004237,4.05339343192112,0.109505954528486,-0.792140172829522,-0.27584475366521,1.97442867217359
Cluster_10,12.2517758911169,-14.5648706995795,-10.4664917603547,9.22394481140486,-7.95536718465381,-0.841859836089486,0.687623530587227,2.03453306558197,-0.330870869741335,0.944034004383818
Cluster_10,7.25822288237564,-7.4336780491792,3.51807229393836,9.47934276508736,0.312172046137646,1.65206836682391,6.94077872095417,-2.58434050246442,-12.1869079942978,4.65109090936497
1

There is 1 best solution below

1
Joyce On
from itertools import cycle

import matplotlib.pyplot as plt  # the module is matplotlib.pyplot; there is no matplotlib.plot
import pandas as pd
from sklearn.manifold import TSNE

# Load the data matrix: the first column ('Cluster') is the group label and
# every remaining column is a principal component.
data = pd.read_csv('./data.csv')  # Replace ./data.csv with your file path.

# Embed only the numeric PC columns (everything after the label column) in 2-D.
# random_state is fixed so repeated runs give the same layout.
tsne = TSNE(n_components=2, random_state=0)
X_2d = tsne.fit_transform(data.iloc[:, 1:])

# Store the embedding next to the labels so each group can be selected by name.
data[['x', 'y']] = X_2d
target_names = data['Cluster'].unique()

colors = 'r', 'g', 'b', 'c', 'm', 'y', 'k', 'gray', 'orange', 'purple'
fig, ax = plt.subplots(figsize=(8, 6))
# cycle() reuses the palette when there are more groups than colors; a plain
# zip(colors, target_names) would silently skip every group after the tenth.
for color, label in zip(cycle(colors), target_names):
    selected = data[data.Cluster.eq(label)]
    ax.scatter(x='x', y='y', data=selected, c=color, label=label)
# Put the legend outside the axes so it does not cover any points.
ax.legend(bbox_to_anchor=(1, 0.5), loc='center left', frameon=False)
plt.show()

enter image description here

If you don't have the pandas library:

import csv

import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE

# Read data from CSV
with open('./data.csv', 'r', newline='') as f:  # Replace ./data.csv with your file path.
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    rows = [row for row in reader if row]  # ignore blank lines

# The first column holds the group name (a string such as "Cluster_1"), so it
# has to be separated from the numeric columns before converting to float;
# converting the whole table with dtype=float raises ValueError.
group_names = [row[0] for row in rows]
labels = np.array(group_names)
data = np.array([row[1:] for row in rows], dtype=float)

# TSNE transformation (random_state fixed for a reproducible layout)
tsne = TSNE(n_components=2, random_state=0)
X_2d = tsne.fit_transform(data)

# Plotting
# dict.fromkeys keeps the groups unique in first-seen order, so the
# group-to-color mapping is the same on every run (a set has no stable order).
target_names = list(dict.fromkeys(group_names))
plt.figure(figsize=(6, 5))
colors = 'r', 'g', 'b', 'c', 'm', 'y', 'k', 'gray', 'orange', 'purple'
for i, label in enumerate(target_names):
    mask = labels == label  # boolean row selector for this group
    # Wrap around the palette instead of failing when there are more groups
    # than colors.
    plt.scatter(X_2d[mask, 0], X_2d[mask, 1],
                c=colors[i % len(colors)], label=label)
plt.legend()
plt.show()