Friday, July 1, 2011

Multi-Thread Job in Hadoop Experiment

Problem Definition

Suppose we have two jobs that read data in the same format, but each job processes the data differently and produces different output.

Job 1: Source Data A → Map → Reduce → Output B
Job 2: Source Data A → Map → Reduce → Output C

If Job1 and Job2 run sequentially and take 20 seconds and 30 seconds respectively, the total time is 50 seconds. Can we use a parallel-processing technique to improve performance?


Goal and Method

Use multiple threads to run Job1 and Job2 simultaneously. Ideally, the total time should approach that of the longer job (30 seconds) rather than the sum of both (50 seconds), so we expect the work to finish in roughly 30 to 50 seconds, improving overall throughput. A minimal sketch of the idea follows; the full program is in the Source Code section below.
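The core pattern wraps the blocking JobClient.runJob() call in a java.lang.Thread, one thread per job, so both jobs are submitted to the cluster at the same time instead of back to back. A minimal sketch of this pattern (conf1 and conf2 stand for two fully configured JobConf objects, such as those built by fnSetJob1 and fnSetJob2 in the full listing):

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class ParallelJobs
{
    // Start one thread per job; each thread blocks inside runJob()
    // until its own job completes.
    static Thread launch(final JobConf conf)
    {
        Thread t = new Thread(new Runnable() {
            public void run() {
                try {
                    JobClient.runJob(conf); // blocks until this job finishes
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        });
        t.start();
        return t;
    }

    public static void runBoth(JobConf conf1, JobConf conf2) throws InterruptedException
    {
        Thread t1 = launch(conf1);
        Thread t2 = launch(conf2);
        t1.join(); // wait for both jobs so total time can be measured
        t2.join();
    }
}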

Experiment Design and Results

Hardware environment:
Two servers: cloud1 and cloud2
Per-server configuration:
CPU: two dual-core AMD processors (dual-socket)
Memory: 8 GB
Disk: 1 TB, 7200 rpm

Software and data environment:
Hadoop 0.20.x; master node: cloud1; slave nodes: cloud1 and cloud2
MySQL database running on cloud2

Experiment design:
Two jobs, Job1 and Job2, with the following workflows:
Job1: Data Source 1 → WordCount (MapReduce) → written to the MySQL server in output format 1
Job2: Data Source 1 → WordCount (MapReduce) → written to the MySQL server in output format 2
Note: output formats 1 and 2 are distinguished because Job1 and Job2 write slightly different content (in the code below, Job2 prefixes each key with "Job2_").
Dataset: a self-generated dataset, 556 KB in size

Results (times in seconds):

                                  Job1 time (s)    Job2 time (s)    Total time (s)
Job1, Job2 run sequentially            21               21               42
Job1, Job2 run as two threads          21               21               21

Conclusion

Based on this small-scale test, running jobs as concurrent threads in Hadoop appears to be a promising parallelization technique: the two jobs completed in 21 seconds when run in parallel versus 42 seconds sequentially, a 2x speedup.
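For reference, the old mapred API can achieve a similar effect without explicit threads: JobClient.submitJob() returns immediately with a RunningJob handle that can be polled for completion. A hypothetical sketch under that assumption (conf1 and conf2 again stand for two fully configured JobConf objects):

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class SubmitBoth
{
    // Submit both jobs without blocking, then poll until both finish.
    public static void runBoth(JobConf conf1, JobConf conf2) throws Exception
    {
        RunningJob job1 = new JobClient(conf1).submitJob(conf1);
        RunningJob job2 = new JobClient(conf2).submitJob(conf2);
        while (!job1.isComplete() || !job2.isComplete()) {
            Thread.sleep(2000); // poll every two seconds
        }
    }
}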




Source Code


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.db.DBConfiguration;
import org.apache.hadoop.mapred.lib.db.DBOutputFormat;
import org.apache.hadoop.mapred.lib.db.DBWritable;
import org.apache.hadoop.util.*;
/* 
MySQL DB Schema:

DROP TABLE IF EXISTS `WordCount`.`Counting`;
CREATE TABLE  `WordCount`.`Counting` (
`name` char(48) default NULL,
`count` int(11) default NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
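
Note: from the table and column names passed to DBOutputFormat.setOutput()
below, DBOutputFormat builds a prepared statement of the form
INSERT INTO Counting (name, count) VALUES (?, ?)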

*/

public class DBWordCount extends Thread
{
    // Each thread submits its own pre-configured job and blocks
    // until that job completes.
    public void run()
    {
        try
        {
            JobClient.runJob(conf);
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
    }

    public void fnSetJob1(String[] args) throws Exception
    {
        conf.setJobName("MySQL DB Wordcount Job1");

        // Make sure the MySQL JDBC driver is available on the classpath
        Class.forName("com.mysql.jdbc.Driver");

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(DBOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));

        // Set up your own host name and account here
        String[] MyDBPath = {"jdbc:mysql://<MySQL host>:3306/WordCount", "<username>", "<password>"};
        DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver", MyDBPath[0], MyDBPath[1], MyDBPath[2]);

        // Write to the Counting table, columns name and count
        DBOutputFormat.setOutput(conf, "Counting", "name", "count");

        // Set Mapper and Reducer classes. A combiner cannot reuse Reduce here:
        // its output type (WordCountInfoRecord) differs from the map output type.
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);

        // Intermediate (map) and final (reduce) key/value types
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);
        conf.setOutputKeyClass(WordCountInfoRecord.class);
        conf.setOutputValueClass(NullWritable.class);
    }

    public void fnSetJob2(String[] args) throws Exception
    {
     //JobConf conf = new JobConf(DBWordCount.class);
     conf.setJobName("MySQL DB Wordcount Job2");


     Class.forName("com.mysql.jdbc.Driver");
     // Set up your host name and account
     String[] MyDBPath={"jdbc:mysql://MySQL主機位置:3306/WordCount","帳號", "密碼"}; 

     conf.setInputFormat(TextInputFormat.class);
     conf.setOutputFormat(DBOutputFormat.class);
     FileInputFormat.setInputPaths(conf, new Path(args[0]));

     // Setup MySQL Connection , default account:root , no password
     DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",MyDBPath[0], MyDBPath[1], MyDBPath[2]);

     // Setup Output MySQL Format
     DBOutputFormat.setOutput(conf, "Counting","name", "count");

     // Set Mapper and Reducer Class
     conf.setMapperClass(Map.class);
     //conf.setCombinerClass(Reduce.class);
     conf.setReducerClass(Reduce2.class);

     // I've tried all combinations , but the bug still happen.
     conf.setMapOutputKeyClass(Text.class);
     conf.setMapOutputValueClass(IntWritable.class);
     conf.setOutputKeyClass(WordCountInfoRecord.class);
     conf.setOutputValueClass(NullWritable.class);
    }

    // Each DBWordCount instance owns one JobConf
    JobConf conf = new JobConf(DBWordCount.class);

    // Output record: implements Writable for Hadoop serialization and
    // DBWritable so DBOutputFormat can bind it to the JDBC prepared statement.
    static class WordCountInfoRecord implements Writable, DBWritable
    {
        public String name;
        public int count;

        public WordCountInfoRecord() {
        }

        public WordCountInfoRecord(String str, int c)
        {
            this.name = str;
            this.count = c;
        }

        // Hadoop serialization
        public void readFields(DataInput in) throws IOException {
            this.name = Text.readString(in);
            this.count = in.readInt();
        }
        public void write(DataOutput out) throws IOException {
            Text.writeString(out, this.name);
            out.writeInt(this.count);
        }

        // JDBC serialization used by DBOutputFormat
        public void readFields(ResultSet result) throws SQLException {
            this.name = result.getString(1);
            this.count = result.getInt(2);
        }
        public void write(PreparedStatement stmt) throws SQLException {
            stmt.setString(1, this.name);
            stmt.setInt(2, this.count);
        }

        public String toString() {
            return this.name + " " + this.count;
        }
    }

    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable>
    {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // Classic word count: emit (token, 1) for every whitespace-separated token
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException
        {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }


    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, WordCountInfoRecord, NullWritable>
    {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<WordCountInfoRecord, NullWritable> output, Reporter reporter) throws IOException
        {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            // Emit one record per word; DBOutputFormat writes it to MySQL
            output.collect(new WordCountInfoRecord(key.toString(), sum), NullWritable.get());
        }
    }

    public static class Reduce2 extends MapReduceBase implements Reducer<Text, IntWritable, WordCountInfoRecord, NullWritable>
    {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<WordCountInfoRecord, NullWritable> output, Reporter reporter) throws IOException
        {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            // Same as Reduce, but prefix the key so Job2's rows are distinguishable
            output.collect(new WordCountInfoRecord("Job2_" + key.toString(), sum), NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception
    {
        // Configure one thread per job
        DBWordCount thread1 = new DBWordCount();
        thread1.fnSetJob1(args);
        DBWordCount thread2 = new DBWordCount();
        thread2.fnSetJob2(args);

        // Submit both jobs concurrently
        thread1.start();
        thread2.start();

        // Wait for both jobs to finish before exiting
        thread1.join();
        thread2.join();
    }
}
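To try this, compile against the Hadoop 0.20 jars plus the MySQL Connector/J driver, then launch it with something like hadoop jar dbwordcount.jar DBWordCount <HDFS input path> (the jar name is only an illustration). Both jobs read the same input path passed as args[0].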
