Loops Are Hard [Python]

I'm not a programmer by trade, so please excuse me if this is obvious. I cannot get a loop to work :/ ...

I have 3 lists:

gene_concepts[0] = ['+0|+77|CFTR', '+12|+77|CYP2C19']

genes = ['CFTR', 'CFTR', 'CFTR', 'CFTR', 'CFTR', 'CFTR', 'CFTR', 
'CFTR', 'CFTR', 'CFTR', 'CFTR', 'CFTR', 'CYP2C19', 'CYP2C19', 
'CYP2C19', 'CYP2C19', 'CYP2C19', 'CYP2C19', 'CYP2C19', 'CYP2C19']

haplotypes = ['CFTR F508del(CTT)', 'CFTR F508del(TCT)', 'CFTR G1244E', 
'CFTR G1349D', 'CFTR G178R', 'CFTR G551D', 'CFTR G551S', 'CFTR S1251N', 
'CFTR S1255P', 'CFTR S549N', 'CFTR S549R(A>C)', 'CFTR S549R(T>G)', 
'CYP2C19 *10', 'CYP2C19 *10', 'CYP2C19 *10', 'CYP2C19 *10', 'CYP2C19 *10',
'CYP2C19 *10', 'CYP2C19 *10', 'CYP2C19 *10']

Note that the haplotypes and genes correspond (i.e., the first element of the haplotypes list corresponds to the first element of the genes list, and so on, so the two lists are in the same order).

I want to create a new list, or just output a set of lines, in which every haplotype is paired with the code of its gene. The gene can be matched either through the genes list or through the first part of the haplotype string; the code to assign is the one in the gene_concepts list, i.e. the first field before the "|" separator in the entry for the matching gene.

Desired Result:

+21|+0|CFTR F508del(CTT)
+22|+0|CFTR F508del(TCT)
+23|+0|CFTR G1244E
+24|+0|CFTR G1349D
+25|+0|CFTR G178R
+26|+0|CFTR G551D
+27|+0|CFTR G551S
+28|+0|CFTR S1251N
+29|+0|CFTR S1255P
+30|+0|CFTR S549N
+31|+0|CFTR S549R(A>C)
+32|+0|CFTR S549R(T>G)
+33|+12|CYP2C19 *10
+34|+12|CYP2C19 *10
+35|+12|CYP2C19 *10
+36|+12|CYP2C19 *10
+37|+12|CYP2C19 *10
+38|+12|CYP2C19 *10
+39|+12|CYP2C19 *10
+40|+12|CYP2C19 *10

, "+21... + 39 - temp_code_2"... , . - , . 2- .

...

# NOTE(review): `gene_concepts[0]` is a subscript expression, not an
# identifier, so it cannot be used as a parameter name -- this `def` line
# does not parse as written.
# NOTE(review): `genes` and `gene_parent_medcodes` are neither parameters nor
# defined in this block; presumably they are module-level lists -- confirm.
def generate_haplotype_concepts(gene_concepts[0], haplotypes):
    # Every pass of the loop over `genes` below prints one
    # "+<temp_code_2>|<parent code>|<haplotype>" line.
    temp_code_2 = 20
    # `index` is assigned here but never read afterwards.
    index = 0

    # Collect the first "|"-separated field of each gene concept entry
    # (e.g. "+0" from "+0|+77|CFTR") as that gene's parent code.
    for batch_line in gene_concepts[0]:
        gene_parent_code = batch_line.split("|")[0]
        gene_parent_medcodes.append(gene_parent_code)

    index_gene = 0
    index_parent_code = 0
    for gene in genes:
        if (index_gene == 0):
            print("+" + str(temp_code_2) + "|"
                  + gene_parent_medcodes[index_parent_code] + "|"
                  + haplotypes[index_gene])
            # index_gene is incremented here AND again at the bottom of the
            # loop, so from the second pass on it runs one position ahead of
            # the element the `for` loop is actually visiting.
            index_gene += 1
        # Because index_gene is one ahead, on the final pass it equals
        # len(genes) and genes[index_gene] raises IndexError.
        elif (genes[index_gene] == genes[index_gene-1]):             
            print("+" + str(temp_code_2) + "|"
                  + gene_parent_medcodes[index_parent_code] + "|"
                  + haplotypes[index_gene-1])
        else:
            # The gene changed: advance to the next parent code. This branch
            # prints haplotypes[index_gene] (no -1) while index_gene is one
            # ahead, so haplotypes[index_gene-1] -- the last haplotype of the
            # previous gene -- is never printed.
            index_parent_code += 1
            print("+" + str(temp_code_2) + "|"
                  + gene_parent_medcodes[index_parent_code] + "|"
                  + haplotypes[index_gene])
        index_gene += 1
        temp_code_2 += 1  

generate_haplotype_concepts(gene_concepts[0], haplotypes) 

:

+21|+0|CFTR F508del(CTT)
+22|+0|CFTR F508del(TCT)
+23|+0|CFTR G1244E
+24|+0|CFTR G1349D
+25|+0|CFTR G178R
+26|+0|CFTR G551D
+27|+0|CFTR G551S
+28|+0|CFTR S1251N
+29|+0|CFTR S1255P
+30|+0|CFTR S549N
+31|+0|CFTR S549R(A>C)
+32|+12|CYP2C19 *10
+33|+12|CYP2C19 *10
+34|+12|CYP2C19 *10
+35|+12|CYP2C19 *10
+36|+12|CYP2C19 *10
+37|+12|CYP2C19 *10
+38|+12|CYP2C19 *10
+39|+12|CYP2C19 *10

2 , ... CFTR (+32 | +0 | CFTR S549R (T > G) ), " " .

-----------------------------------------------------------------------
----
IndexError                                Traceback (most recent call 
last)
<ipython-input-16-1410b2513457> in <module>()
     55 
     56 
---> 57 generate_haplotype_concepts(gene_concepts[0], haplotypes)

<ipython-input-16-1410b2513457> in 
generate_haplotype_concepts(temp_code_2, haplotypes)
     30 #                             + "\n" )
     31             index_gene += 1
---> 32         elif (genes[index_gene] == genes[index_gene-1]):
     33             print("+" + str(temp_code_2) + "|"
     34                   + gene_parent_medcodes[index_parent_code] + 
"|"

IndexError: list index out of range

, ... , , , ... !

+4
4

, - :

# Concept batches keyed by batch number; only batch 0 is used below.
gene_concepts = {0: ['+0|+77|CFTR', '+12|+77|CYP2C19']}

# The genes list is not actually needed: the gene name can be read from the
# first word of each haplotype instead.
genes = ['CFTR'] * 12 + ['CYP2C19'] * 8

haplotypes = [
    'CFTR F508del(CTT)', 'CFTR F508del(TCT)', 'CFTR G1244E', 'CFTR G1349D',
    'CFTR G178R', 'CFTR G551D', 'CFTR G551S', 'CFTR S1251N', 'CFTR S1255P',
    'CFTR S549N', 'CFTR S549R(A>C)', 'CFTR S549R(T>G)',
] + ['CYP2C19 *10'] * 8

# Lookup table: gene name -> the split concept entry, so that
# "+0|+77|CFTR" is stored as {"CFTR": ["+0", "+77", "CFTR"]}.
gene_concept_codes = {
    fields[2]: fields
    for fields in (entry.split('|') for entry in gene_concepts[0])
}

temp_id = 20  # arbitrary starting point chosen to match the desired output

# Number the haplotypes from temp_id + 1 and prefix each one with the code
# (first field) of the concept entry that belongs to its gene.
for temp_id, haplotype in enumerate(haplotypes, start=temp_id + 1):
    parent_code = gene_concept_codes[haplotype.split()[0]][0]
    print("+{}|{}|{}".format(temp_id, parent_code, haplotype))

:

+21|+0|CFTR F508del(CTT)
+22|+0|CFTR F508del(TCT)
+23|+0|CFTR G1244E
+24|+0|CFTR G1349D
+25|+0|CFTR G178R
+26|+0|CFTR G551D
+27|+0|CFTR G551S
+28|+0|CFTR S1251N
+29|+0|CFTR S1255P
+30|+0|CFTR S549N
+31|+0|CFTR S549R(A>C)
+32|+0|CFTR S549R(T>G)
+33|+12|CYP2C19 *10
+34|+12|CYP2C19 *10
+35|+12|CYP2C19 *10
+36|+12|CYP2C19 *10
+37|+12|CYP2C19 *10
+38|+12|CYP2C19 *10
+39|+12|CYP2C19 *10
+40|+12|CYP2C19 *10
+1

( ):

haplotypes = [
    'CFTR F508del(CTT)', 'CFTR F508del(TCT)', 'CFTR G1244E', 'CFTR G1349D',
    'CFTR G178R', 'CFTR G551D', 'CFTR G551S', 'CFTR S1251N', 'CFTR S1255P',
    'CFTR S549N', 'CFTR S549R(A>C)', 'CFTR S549R(T>G)',
] + ['CYP2C19 *10'] * 8

# Gene name -> concept code; a dictionary makes the lookup trivial.
gene_concepts = {'CFTR': 0, 'CYP2C19': 12}

for haplotype in haplotypes:
    # The gene name is everything before the first whitespace.
    gene_name = haplotype.split()[0]
    try:
        code = gene_concepts[gene_name]
    except KeyError:
        # Unknown gene: tell the user instead of failing.
        print('Gene with unknown concept: "{0}"'.format(haplotype))
    else:
        print("{0}|{1}".format(code, haplotype))

:

0|CFTR F508del(CTT)
0|CFTR F508del(TCT)
0|CFTR G1244E
0|CFTR G1349D
0|CFTR G178R
0|CFTR G551D
0|CFTR G551S
0|CFTR S1251N
0|CFTR S1255P
0|CFTR S549N
0|CFTR S549R(A>C)
0|CFTR S549R(T>G)
12|CYP2C19 *10
12|CYP2C19 *10
12|CYP2C19 *10
12|CYP2C19 *10
12|CYP2C19 *10
12|CYP2C19 *10
12|CYP2C19 *10
12|CYP2C19 *10

, , , , . , , .

+2

, , gene_concepts . ( ), :

# gene_concepts is a list of batches; batch 0 holds the "code|parent|gene" entries.
gene_concepts = [['+0|+77|CFTR', '+12|+77|CYP2C19']]

# genes[i] is the gene of haplotypes[i].
genes = ['CFTR'] * 12 + ['CYP2C19'] * 8

haplotypes = [
    'CFTR F508del(CTT)', 'CFTR F508del(TCT)', 'CFTR G1244E', 'CFTR G1349D',
    'CFTR G178R', 'CFTR G551D', 'CFTR G551S', 'CFTR S1251N', 'CFTR S1255P',
    'CFTR S549N', 'CFTR S549R(A>C)', 'CFTR S549R(T>G)',
] + ['CYP2C19 *10'] * 8

def generate_haplotype_concepts(gene_concepts, genes, haplotypes):
    """Yield a ``(concept, haplotype)`` tuple for each gene/haplotype pair.

    ``gene_concepts`` is an iterable of ``"code|...|gene"`` strings; the
    first field is the concept code and the third field is the gene name.
    ``genes`` and ``haplotypes`` are walked in lockstep. The concept is
    looked up by the gene first and, if that fails, by the first
    whitespace-separated word of the haplotype.

    Raises ValueError when neither the gene nor the haplotype prefix is a
    known gene name.
    """
    # Build the lookup: gene name (third field) -> concept code (first field).
    code_by_gene = {}
    for entry in gene_concepts:
        fields = entry.split('|')
        code_by_gene[fields[2]] = fields[0]

    for gene_name, haplotype in zip(genes, haplotypes):
        code = code_by_gene.get(gene_name)
        if code is not None:
            yield code, haplotype
            continue

        # The gene itself is unknown: fall back to the haplotype's first word.
        prefix = haplotype.split()[0]
        code = code_by_gene.get(prefix)
        if code is None:
            raise ValueError("Missing gene/prefix: {}/{} in hap {}".format(
                gene_name, prefix, haplotype))

        yield code, haplotype

# First listing: concept code and haplotype only.
print("##### Concept|Haplotype, no ID #####")
pairs = generate_haplotype_concepts(gene_concepts[0], genes, haplotypes)
for pair in pairs:
    print("{}|{}".format(*pair))

# Second listing: the same pairs, numbered from 21.
print("\n\n##### ID|Concept|Haplotype #####")

numbered = enumerate(
    generate_haplotype_concepts(gene_concepts[0], genes, haplotypes),
    start=21)
for number, (code, hap) in numbered:
    print("+{}|{}|{}".format(number, code, hap))
+1

, , , , , . , , , . .

for haplotype in haplotypes:
    # The gene name is the first whitespace-separated word of the haplotype.
    gene_name = haplotype.split()[0]
    if gene_name in genes:
        print(haplotype)

, , haplotype, . , , , , . split Python, "". . [0] , , .

Here you will see that we are using the keyword in. It simply searches genes to check whether the string is in the list of genes of interest. That gives you a condition that holds for each haplotype you are interested in. From there, I believe you can simply put this code into something like:

for haplotype in haplotypes:
    # The gene name is the first whitespace-separated word of the haplotype.
    gene_name = haplotype.split()[0]
    if gene_name in genes:
        print("your code {}".format(haplotype))
0
source

Source: https://habr.com/ru/post/1678205/


All Articles