How to read duplicate words from a directory or folder

I got this program below from a coding site.

The following code reads a text file and finds duplicate words.

I want to read every text file in a folder line by line and display the duplicate words of each file together with their counts. I also do not know how to pass the files to this code when the file name is not stored as a String; I tried using a BufferedReader, but I do not get any output.

My questions:

  • How can I make a program read several files from a given folder?

  • How to save results in Excel file format?

Any suggestions are welcome.

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringTokenizer;


/**
 * Counts how often each word occurs in a text file and prints the words
 * ordered by descending frequency.
 */
public class MaxDuplicateWordCount {

    /**
     * Reads a text file and counts the occurrences of every word in it.
     *
     * <p>The file is decoded as UTF-8. Words are separated by any run of
     * whitespace (spaces, tabs, form feeds) and are lower-cased before being
     * counted, so "Apple" and "apple" are the same word.
     *
     * @param fileName path of the file to read
     * @return map from lower-cased word to its number of occurrences; empty if
     *         the file cannot be opened, partial if reading fails midway (the
     *         error is reported on standard error in both cases)
     */
    public Map<String, Integer> getWordCount(String fileName){

        Map<String, Integer> wordMap = new HashMap<String, Integer>();

        // try-with-resources closes the reader (and the underlying stream) on every path.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                // The default delimiter set covers tabs as well as spaces.
                StringTokenizer st = new StringTokenizer(line);
                while (st.hasMoreTokens()) {
                    String word = st.nextToken().toLowerCase();
                    Integer count = wordMap.get(word);
                    wordMap.put(word, count == null ? 1 : count + 1);
                }
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException; callers still receive what was counted so far.
            System.err.println("Could not read " + fileName + ": " + e);
        }
        return wordMap;
    }

    /**
     * Returns the entries of {@code wordMap} sorted by count, highest first.
     * Words with the same count are ordered alphabetically so that the result
     * does not depend on the map's iteration order.
     *
     * @param wordMap word counts; keys and values must be non-null
     * @return a new list holding the map's entries in sorted order
     */
    public List<Entry<String, Integer>> sortByValue(Map<String, Integer> wordMap){

        List<Entry<String, Integer>> list =
                new ArrayList<Entry<String, Integer>>(wordMap.entrySet());
        Collections.sort(list, new Comparator<Entry<String, Integer>>() {
            @Override
            public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                int byCount = o2.getValue().compareTo(o1.getValue());
                return byCount != 0 ? byCount : o1.getKey().compareTo(o2.getKey());
            }
        });
        return list;
    }

    /**
     * Prints the word counts of one file as {@code word =count} lines.
     *
     * @param a optional first element: path of the file to analyse; when
     *          absent the original sample path is used
     */
    public static void main(String a[]){

        String fileName = a.length > 0 ? a[0] : "E:\\Blog 39.txt";

        MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
        List<Entry<String, Integer>> list = mdc.sortByValue(mdc.getWordCount(fileName));
        for (Entry<String, Integer> entry : list) {
            System.out.println(entry.getKey()+" ="+entry.getValue());
        }
    }
}
+4
source share
2 answers

Introduction

After talking with the OP, briefly what the OP requires:

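1- Read the file(s) from a given folder; the files are typically Unicode text files.
 2- Process each file with the algorithm from the OP's question and save the results in a Unicode-capable format (the OP later asked for Excel (.XLS) output because of problems with Unicode text in Excel).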

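The solution consists of the following steps:

1 Define (set) the workspace folder.
2 Create the output folder inside the workspace if it does not exist yet.
3 Read every file in the workspace folder and run the word-count algorithm on it.
4 Save the results to the output folder as an Excel file.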

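First, you need the Apache POI library, which lets you create XLS files. After downloading poi/poi-3.5-FINAL.jar.zip (1,372 k), add the jar to your classpath and add the following imports to your code.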

import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFRow;

Next, add the following code to your class:

/**
 * Folder whose files are processed. Paths are built by plain string
 * concatenation with this value, so it has to end with a path separator.
 */
static final String WORKSPACE = "C:/testfolder/";

/**
 * Creates the output folder inside {@code WORKSPACE} if it is not there yet.
 *
 * <p>This is best effort: a failure is reported on standard error instead of
 * being thrown, so the caller keeps running.
 *
 * @param outputFolderName folder name, appended directly to {@code WORKSPACE}
 */
private static void createOutputFolder(String outputFolderName) {
    File outputDirectory = new File(WORKSPACE + outputFolderName);

    if (outputDirectory.isDirectory()) {
        return;
    }
    try {
        // mkdir() reports most failures through its return value, not an exception.
        if (!outputDirectory.mkdir()) {
            System.err.println("Could not create output folder " + outputDirectory);
        }
    } catch (SecurityException e) {
        System.err.println("Not allowed to create output folder " + outputDirectory + ": " + e);
    }
}

/**
 * Counts the words of every regular file directly inside {@code WORKSPACE}
 * and writes one Excel (.xls) workbook per input file into the
 * {@code output/} subfolder.
 *
 * <p>Each workbook holds a single sheet named "ResultSheet" with a header row
 * followed by one row per distinct word (serial number, word, count), in the
 * order returned by {@code sortByValue}. An error while writing one workbook
 * is printed and does not stop the remaining files from being processed.
 */
private static void exlCreator() {

    String outputFolder = "output/";
    createOutputFolder(outputFolder);

    MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
    File[] listOfFiles = new File(WORKSPACE).listFiles();
    if (listOfFiles == null) {
        // listFiles() returns null when the path is not a directory or cannot be read.
        System.err.println(WORKSPACE + " is not a readable directory");
        return;
    }

    for (File inputFile : listOfFiles) {
        if (!inputFile.isFile()) {
            continue; // skips sub-folders, including the output folder itself
        }
        String fileName = inputFile.getName();
        List<Entry<String, Integer>> list =
                mdc.sortByValue(mdc.getWordCount(WORKSPACE + fileName));
        String fileNameWPathOutput =
                WORKSPACE + outputFolder + stripExtension(fileName) + "output.xls";
        try {
            HSSFWorkbook workbook = new HSSFWorkbook();
            HSSFSheet sheet = workbook.createSheet("ResultSheet");

            HSSFRow rowhead = sheet.createRow(0);
            rowhead.createCell(0).setCellValue("Serial No.");
            rowhead.createCell(1).setCellValue("Word");
            rowhead.createCell(2).setCellValue("Count");

            // Declared per file so a failure cannot leak a stale counter into the next
            // workbook; passed as int because a short cast would wrap past 32767 rows.
            int serialNumber = 1;
            for (Map.Entry<String, Integer> entry : list) {
                HSSFRow row = sheet.createRow(serialNumber);
                row.createCell(0).setCellValue(serialNumber);
                row.createCell(1).setCellValue(entry.getKey());
                row.createCell(2).setCellValue(entry.getValue());
                serialNumber++;
            }
            try (FileOutputStream fileOut = new FileOutputStream(fileNameWPathOutput)) {
                workbook.write(fileOut);
            }
            System.out.println(fileNameWPathOutput + " is created");

        } catch (IOException | RuntimeException ex) {
            System.out.println(ex);
        }
    }
}

/**
 * Returns {@code fileName} without its last extension ("a.txt" becomes "a").
 * Names without an extension, or starting with their only dot, are returned unchanged.
 */
private static String stripExtension(String fileName) {
    int dot = fileName.lastIndexOf('.');
    return dot > 0 ? fileName.substring(0, dot) : fileName;
}

/**
 * Program entry point: processes the workspace folder by calling
 * {@code exlCreator()}.
 *
 * @param args command-line arguments; not used
 * @throws IOException declared by the signature only; {@code exlCreator()}
 *         itself declares no checked exceptions
 */
public static void main(String[] args) throws IOException {
    exlCreator();
}

I have tested this code, and it works. Here is the result in Excel, including Unicode text: [screenshot not preserved]

Links (the link targets were lost when this page was extracted):

  • POI
  • POI
  • Unicode CSV
  • CSV

Here is the complete code, as requested by the OP:

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Map.Entry;
// for writing the Excel sheet (Apache POI)
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFRow;

/**
 * Counts word frequencies for every file in a workspace folder and writes the
 * result for each file to an Excel (.xls) workbook.
 */
public class MaxDuplicateWordCount {

    /**
     * Folder whose files are processed. Paths are built by plain string
     * concatenation with this value, so it has to end with a path separator.
     */
    final static String WORKSPACE = "C:/testfolder/";

    /**
     * Reads a text file and counts the occurrences of every word in it.
     *
     * <p>The file is decoded as UTF-8. Words are separated by any run of
     * whitespace (spaces, tabs, form feeds) and are lower-cased before being
     * counted, so "Apple" and "apple" are the same word.
     *
     * @param fileName path of the file to read
     * @return map from lower-cased word to its number of occurrences; empty if
     *         the file cannot be opened, partial if reading fails midway (the
     *         error is reported on standard error in both cases)
     */
    public Map<String, Integer> getWordCount(String fileName) {

        Map<String, Integer> wordMap = new HashMap<String, Integer>();

        // try-with-resources closes the reader (and the underlying stream) on every path.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                // The default delimiter set covers tabs as well as spaces.
                StringTokenizer st = new StringTokenizer(line);
                while (st.hasMoreTokens()) {
                    String word = st.nextToken().toLowerCase();
                    Integer count = wordMap.get(word);
                    wordMap.put(word, count == null ? 1 : count + 1);
                }
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException; callers still receive what was counted so far.
            System.err.println("Could not read " + fileName + ": " + e);
        }
        return wordMap;
    }

    /**
     * Returns the entries of {@code wordMap} sorted by count, highest first.
     * Words with the same count are ordered alphabetically so that the result
     * does not depend on the map's iteration order.
     *
     * @param wordMap word counts; keys and values must be non-null
     * @return a new list holding the map's entries in sorted order
     */
    public List<Entry<String, Integer>> sortByValue(Map<String, Integer> wordMap) {

        List<Entry<String, Integer>> list =
                new ArrayList<Entry<String, Integer>>(wordMap.entrySet());
        Collections.sort(list, new Comparator<Entry<String, Integer>>() {
            @Override
            public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                int byCount = o2.getValue().compareTo(o1.getValue());
                return byCount != 0 ? byCount : o1.getKey().compareTo(o2.getKey());
            }
        });
        return list;
    }

    /**
     * Creates the output folder inside {@code WORKSPACE} if it is not there yet.
     *
     * <p>This is best effort: a failure is reported on standard error instead
     * of being thrown, so the caller keeps running.
     *
     * @param outputFolderName folder name, appended directly to {@code WORKSPACE}
     */
    private static void createOutputFolder(String outputFolderName) {
        File outputDirectory = new File(WORKSPACE + outputFolderName);

        if (outputDirectory.isDirectory()) {
            return;
        }
        try {
            // mkdir() reports most failures through its return value, not an exception.
            if (!outputDirectory.mkdir()) {
                System.err.println("Could not create output folder " + outputDirectory);
            }
        } catch (SecurityException e) {
            System.err.println(
                    "Not allowed to create output folder " + outputDirectory + ": " + e);
        }
    }

    /**
     * Counts the words of every regular file directly inside {@code WORKSPACE}
     * and writes one Excel (.xls) workbook per input file into the
     * {@code output/} subfolder.
     *
     * <p>Each workbook holds a single sheet named "ResultSheet" with a header
     * row followed by one row per distinct word (serial number, word, count),
     * in the order returned by {@link #sortByValue(Map)}. An error while
     * writing one workbook is printed and does not stop the remaining files
     * from being processed.
     */
    private static void exlCreator() {

        String outputFolder = "output/";
        createOutputFolder(outputFolder);

        MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
        File[] listOfFiles = new File(WORKSPACE).listFiles();
        if (listOfFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.err.println(WORKSPACE + " is not a readable directory");
            return;
        }

        for (File inputFile : listOfFiles) {
            if (!inputFile.isFile()) {
                continue; // skips sub-folders, including the output folder itself
            }
            String fileName = inputFile.getName();
            List<Entry<String, Integer>> list =
                    mdc.sortByValue(mdc.getWordCount(WORKSPACE + fileName));
            String fileNameWPathOutput =
                    WORKSPACE + outputFolder + stripExtension(fileName) + "output.xls";
            try {
                HSSFWorkbook workbook = new HSSFWorkbook();
                HSSFSheet sheet = workbook.createSheet("ResultSheet");

                HSSFRow rowhead = sheet.createRow(0);
                rowhead.createCell(0).setCellValue("Serial No.");
                rowhead.createCell(1).setCellValue("Word");
                rowhead.createCell(2).setCellValue("Count");

                // Declared per file so a failure cannot leak a stale counter into the
                // next workbook; passed as int because a short cast would wrap past
                // 32767 rows.
                int serialNumber = 1;
                for (Map.Entry<String, Integer> entry : list) {
                    HSSFRow row = sheet.createRow(serialNumber);
                    row.createCell(0).setCellValue(serialNumber);
                    row.createCell(1).setCellValue(entry.getKey());
                    row.createCell(2).setCellValue(entry.getValue());
                    serialNumber++;
                }
                try (FileOutputStream fileOut = new FileOutputStream(fileNameWPathOutput)) {
                    workbook.write(fileOut);
                }
                System.out.println(fileNameWPathOutput + " is created");

            } catch (IOException | RuntimeException ex) {
                System.out.println(ex);
            }
        }
    }

    /**
     * Returns {@code fileName} without its last extension ("a.txt" becomes "a").
     * Names without an extension, or starting with their only dot, are returned unchanged.
     */
    private static String stripExtension(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot > 0 ? fileName.substring(0, dot) : fileName;
    }

    /**
     * Program entry point: processes the workspace folder.
     *
     * @param args command-line arguments; not used
     * @throws IOException declared by the signature only; {@code exlCreator()}
     *         declares no checked exceptions
     */
    public static void main(String[] args) throws IOException {
        exlCreator();
    }
}
+2

To process every file in a folder, list the folder's contents and loop over the files:

File folder = new File("/Users/you/folder/");
File[] listOfFiles = folder.listFiles();

// listFiles() returns null when the path is not a directory or cannot be read;
// fail with a clear message instead of a NullPointerException in the loop.
if (listOfFiles == null) {
    throw new IllegalStateException("Cannot list files in " + folder);
}

for (File file : listOfFiles) {

    if (!file.isFile()) {
        continue; // skip sub-folders and other non-regular entries
    }
    /*
     * This also works if your file is not a plain text file.
     * If I understood you correctly:
     *      "And how to call that files if it is not stored as String"
     * you can read it as byte[] and decode it into a String.
     */
    byte[] bytes = Files.readAllBytes(file.toPath());
    String decoded = new String(bytes, "UTF-8");
    String[] words = decoded.split("\\s+");
    for (int i = 0; i < words.length; i++) {
        /*  Strip everything that is not a word character. The (?U) flag makes
         *  \w Unicode-aware; without it \w matches only ASCII letters, digits
         *  and '_', so words in other scripts would be erased completely.
         *  It may still be necessary to adjust the character class.
         */
        words[i] = words[i].replaceAll("(?U)[^\\w]", "");
        // Here are all the words from a file. You can do whatever you want with them
    }
}
0

Source: https://habr.com/ru/post/1625082/


All Articles