Perl / Linux: filter a large file using the contents of another file

I am filtering a 580 MB file using the contents of another, smaller file.

File1 (smaller file):

chr start  End
1    123   150
2    245   320
2    450   600

File2 (large file):

chr pos RS ID A B C D E F
1   124 r2 3  s 4 s 2 s 2
1   165 r6 4  t 2 k 1 r 2
2   455 t2 4  2 4 t 3 w 3
3   234 r4 2  5 w 4 t 2 4

I would like to grab lines from File2 when the following criteria are met: File2.Chr == File1.Chr && File2.Pos > File1.Start && File2.Pos < File1.End. I've tried to use awk, but it works very slowly; is there a better way to do the same?
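For the sample data above, those criteria select exactly two lines:

1   124 r2 3  s 4 s 2 s 2
2   455 t2 4  2 4 t 3 w 3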

Thanks.

Here is the code I'm using:

#!/usr/bin/perl
use strict;
use warnings;

my $bed_file   = "/data/1000G/Hotspots.bed";   # File1, smaller file
my $SNP_file   = "/data/1000G/SNP_file.txt";   # File2, larger file
my $final_file = "/data/1000G/final_file.txt"; # final output file

open my $in_fh, '<', $bed_file
        or die qq{Unable to open "$bed_file" for input: $!};

while ( <$in_fh> ) {
    next if /\btrack\b/;               # skip header line

    my @data  = split /\t/;
    my $chr   = $data[0];
    $chr =~ s/chr//g;
    my $start = $data[1] - 1;
    my $end   = $data[2] + 1;
    print "chr is $chr, start is $start, end is $end\n";

    # scans the whole large file once per BED line -- this is the bottleneck
    my $cmd = "awk '\$1==$chr && \$2>$start && \$2<$end "
            . "{ print \"chr\" \$1 \"_\" \$2 \"_\" \$3 \"_\" \$4 \"_\" \$5 \"_\" \$6 \"_\" \$7 \"_\" \$8 }' "
            . "$SNP_file >> $final_file";
    system($cmd) == 0 or die "awk failed: $?";
}
5 answers

Read the small file into a data structure and check every line of the other file against it.

The ranges from the small file are kept as arrayrefs in an array; each line of the big file is then compared against them.

use warnings 'all';
use strict;

my $ref_file = 'reference.txt';
open my $fh, '<', $ref_file or die "Can't open $ref_file: $!";
my @ref = map { chomp; [ split ] } grep { /\S/ } <$fh>;

my $data_file = 'data.txt';
open $fh, '<', $data_file or die "Can't open $data_file: $!";

# Drop header lines
my $ref_header  = shift @ref;    
my $data_header = <$fh>;

while (<$fh>) 
{
    next if not /\S/;  # skip empty lines
    my @line = split;

    foreach my $refline (@ref) 
    {
        next if $line[0] != $refline->[0];
        if ($line[1] > $refline->[1] and $line[1] < $refline->[2]) {
            print "@line\n";
        }
    }   
}
close $fh;

This is the basic approach. If the ranges for a given chr cannot overlap, you can leave the foreach with last as soon as a line matches, instead of checking the remaining ranges.
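For example (a sketch of that variant; it assumes at most one range per chr can match):

foreach my $refline (@ref)
{
    next if $line[0] != $refline->[0];
    if ($line[1] > $refline->[1] and $line[1] < $refline->[2]) {
        print "@line\n";
        last;   # ranges don't overlap, so no other range can match
    }
}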

A few comments on the code:

In list context <$fh> returns all lines at once, and grep keeps only those containing a non-space character. Inside map, chomp removes the newline and the [ ] builds an arrayref of the fields produced by split. The resulting list of arrayrefs is assigned to @ref.

Reopening $fh on the second file implicitly closes the first one (if it was open), so no explicit close is needed there.

There is room for further optimization, but this reads each file only once and should already be dramatically faster than shelling out to awk for every line of the small file.


Another option: read the smaller file into a hash of arrays (HoA) keyed by chr:

use strict;
use warnings;

my $small_file = 'small.txt';
my $large_file = 'large.txt';

open my $small_fh, '<', $small_file or die $!;

my %small;

while (<$small_fh>){
    next if $. == 1;                           # skip the header line
    my ($chr, $start, $end) = split /\s+/, $_;
    push @{ $small{$chr} }, [$start, $end];    # one [start, end] pair per range
}

close $small_fh;

open my $large_fh, '<', $large_file or die $!;

while (my $line = <$large_fh>){
    my ($chr, $pos) = (split /\s+/, $line)[0, 1];

    if (defined $small{$chr}){
        for (@{ $small{$chr} }){
            if ($pos > $_->[0] && $pos < $_->[1]){
                print $line;
            }
        }
    }
}
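If each chr's list of ranges is additionally sorted by start (a refinement not in the original answer), the inner loop can stop as soon as no later range can contain the position:

# after filling %small: sort each chr's ranges by their start position
@$_ = sort { $a->[0] <=> $b->[0] } @$_ for values %small;

# inner loop with early exit
for my $range (@{ $small{$chr} }){
    last if $range->[0] >= $pos;    # later ranges start even further right
    print $line if $pos < $range->[1];
}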

Another way entirely: load both files into SQLite and let the database do the work. The import costs some time up front, but after that you can run arbitrary SQL queries against the data, and this kind of join is exactly what a database is built for.

First turn both files into CSV so SQLite can import them. Since the data contains no commas, a simple s{ +}{,}g is enough; for anything trickier, use Text::CSV_XS.
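For example (a sketch: the .csv file names are made up, and tail -n +2 strips the header line so it is not imported as data):

tail -n +2 File1 | perl -pe 's{ +}{,}g' > file1.csv
tail -n +2 File2 | perl -pe 's{ +}{,}g' > file2.csv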

Then create the tables (types inferred from the sample data).

create table file1 (
    chr integer not null,
    start integer not null,
    "end" integer not null  -- END is an SQL keyword; quoting it is safest
);

create table file2 (
    chr integer not null,
    pos integer not null,
    rs text not null,       -- values like r2, t2 are not integers
    id integer not null,
    a char not null,
    b char not null,
    c char not null,
    d char not null,
    e char not null,
    f char not null
);
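With the tables created, the CSV files can be loaded from the sqlite3 shell (again a sketch; genome.db is a made-up name):

sqlite3 genome.db
sqlite> .mode csv
sqlite> .import file1.csv file1
sqlite> .import file2.csv file2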

Create indexes on the columns used in the join and in the range test; without them, SQLite falls back to scanning a whole table for every row it joins.

create index chr_file1 on file1 (chr);
create index chr_file2 on file2 (chr);
create index pos_file2 on file2 (pos);
create index start_file1 on file1 (start);
create index end_file1 on file1 ("end");

And the query itself (note that BETWEEN is inclusive; use > and < instead if the bounds must be strict, as in the question):

select *
from file2
join file1 on file1.chr = file2.chr
where file2.pos between file1.start and file1."end";

1,124,r2,3,s,4,s,2,s,2,1,123,150
2,455,t2,4,2,4,t,3,w,3,2,450,600

From Perl, the whole thing can be driven with DBI and DBD::SQLite.
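Here is a minimal sketch of the query step with DBI (the genome.db file name is an assumption):

#!/usr/bin/perl
use strict;
use warnings;
use DBI;

my $dbh = DBI->connect("dbi:SQLite:dbname=genome.db", "", "",
                       { RaiseError => 1 });

my $sth = $dbh->prepare(q{
    select *
    from file2
    join file1 on file1.chr = file2.chr
    where file2.pos between file1.start and file1."end"
});
$sth->execute;

# print each joined row in the CSV form shown above
while (my @row = $sth->fetchrow_array) {
    print join(",", @row), "\n";
}

$dbh->disconnect;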


The problem is not awk itself but invoking it once per range. In awk, Perl, or Python (the OP asked about Perl), the approach is the same:

  • read the small file once and build a mapping: chr => list of (start, end) pairs;
  • stream through the large file and, for each line, check its chr and pos against that mapping.

In Python:

with open("smallfile.txt") as f:
    next(f) # skip title
    # build a dictionary with chr as key, and list of start,end as values
    d = collections.defaultdict(list)
    for line in f:
        toks = line.split()
        if len(toks)==3:
            d[int(toks[0])].append((int(toks[1]),int(toks[2])))


with open("largefile.txt") as f:
    next(f) # skip title
    for line in f:
        toks = line.split()
        chr_tok = int(toks[0])
        if chr_tok in d:
            # key is in dictionary
            pos = int(toks[1])
            if any(lambda x : t[0]<pos<t[1] for t in d[chr_tok]):
                print(line.strip())

We could be a little faster by sorting each list of tuples and applying bisect to avoid the linear search. That is only worth doing if the lists of ranges from the "small" file are large.


Pure awk, in a single pass. Your code scans file2 once for every line of file1, so the execution time grows with the product of the two file sizes. The one-pass version below loads file1 into memory instead; please let me know if it turns out slower than the other solutions. (Note: the a[$1][i][0] arrays-of-arrays syntax requires GNU awk 4.0 or later.)

awk 'NR==FNR {
    i = b[$1];        # get the next index for the chr
    a[$1][i][0] = $2; # store start
    a[$1][i][1] = $3; # store end
    b[$1]++;          # increment the next index
    next;
}

{
    p = 0;
    if ($1 in a) {
        for (i in a[$1]) {
            if ($2 > a[$1][i][0] && \
                $2 < a[$1][i][1])
                p = 1                 # set p if $2 in range
        }
    }
}

p {print}'

As a one-liner:

awk 'NR==FNR {i = b[$1];a[$1][i][0] = $2; a[$1][i][1] = $3; b[$1]++;next; }{p = 0;if ($1 in a){for(i in a[$1]){if($2>a[$1][i][0] && $2<a[$1][i][1])p=1}}}p' file1 file2
