기록/그 외 프로젝트 기록

[Hadoop] Map Reduce를 위한 maven project 만들기

5월._. 2022. 10. 11.

728x90

[Hadoop] Map Reduce를 위한 maven project 만들기

하둡 클러스터를 통해서 mapreduce를 실행했는데, 이때 꼭 jar 파일이 필요했다.

미리 공부했던 방식은 ant를 이용한 방식이었는데, 코드를 처음부터 직접 작성하다 보니 그 방식을 새로 공부하기보다 이미 알고 있는 maven 방식을 쓰는 게 좋다는 결론을 내렸다.

여건 상 하둡 클러스터 서버에서 maven을 설치할 수 없었기 때문에 로컬에서 빌드한 후 서버로 이동시켰다.

1. pom.xml

81번째줄의 mainClass부분이 중요하다.

메인클래스를 지정해야 jar파일을 실행할 때 Driver에서 설정한 명령어로 다른 클래스를 실행할 수 있다.

 <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.apache.hadoop.examples</groupId>
<artifactId>wayg</artifactId>
<packaging>jar</packaging> <!-- the build output is a jar, which is what `hadoop jar` consumes -->
<version>1.0</version>
<name>wayg</name>
<url>http://maven.apache.org</url>
<dependencies> <!-- Hadoop artifacts below are "provided": compiled against, but expected to come from the cluster at run time -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>3.8.1</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-examples</artifactId>
        <version>2.7.3</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-common</artifactId>
        <version>2.7.3</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.3</version>
        <scope>provided</scope>
    </dependency>
    <dependency> <!-- no scope is given, so Maven's default (compile) scope applies to this library -->
        <groupId>org.openkoreantext</groupId>
        <artifactId>open-korean-text</artifactId>
        <version>2.3.1</version>
    </dependency>
 
</dependencies>
<build>
    <plugins>
    <plugin> <!-- shade plugin: its "shade" goal is bound to the package phase in the execution below -->
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.3</version>
        <configuration>
 
        <transformers>
            <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer">
            </transformer>
        </transformers>
        </configuration>
        <executions>
        <execution>
            <phase>package</phase>
                <goals>
                <goal>shade</goal>
                </goals>
        </execution>
        </executions>
        </plugin>
    <plugin> <!-- compiler plugin: source and target levels are both set to Java 1.8 -->
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.6.1</version>
        <configuration>
        <source>1.8</source>
        <target>1.8</target>
        </configuration>
    </plugin>
        <plugin> <!-- jar plugin: controls the manifest written into the jar -->
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-jar-plugin</artifactId>
            <version>3.0.2</version>
            <configuration>
                <archive>
                    <manifest>
                        <mainClass>org.apache.hadoop.examples.Driver</mainClass> <!-- manifest entry point; must be the fully qualified name of the Driver class -->
                        <addClasspath>true</addClasspath> <!-- also write a Class-Path entry into the manifest -->
                    </manifest>
                </archive>
            </configuration>
        </plugin>
    </plugins>
</build>
    <properties> <!-- read sources and write reports as UTF-8 -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    </properties>
</project>

2. Driver.java

클래스가 여러 개 있을 때 그걸 컨트롤하기 위한 클래스다.

여기에서는 wordcount라는 명령으로 WordCount.class를 실행시켰다.

 package org.apache.hadoop.examples;
 
import org.apache.hadoop.util.ProgramDriver;
 
/**
 * Command-line entry point of the jar.
 *
 * <p>Registers each MapReduce program under a short command name and lets
 * Hadoop's {@link ProgramDriver} pick the one named by the first argument,
 * e.g. {@code hadoop jar app.jar wordcount <in> <out>}.
 */
public class Driver {
	/**
	 * Dispatches to the program named by {@code args[0]} and exits the JVM.
	 *
	 * @param args the command name followed by the arguments forwarded to the
	 *             selected program's {@code main}
	 */
	public static void main(String[] args) {
		// Stays -1 when dispatch throws before run() returns a result.
		int exitCode = -1;
		ProgramDriver pgd = new ProgramDriver();
		try {
			pgd.addClass("wordcount", WordCount.class, "A map/reduce program that performs word counting.");

			// Dispatch exactly once. ProgramDriver.driver() is only the Hadoop 1.x
			// compatible wrapper around run(), so calling both would invoke the
			// selected program twice.
			exitCode = pgd.run(args);
		} catch (Throwable e) {
			e.printStackTrace();
		}

		System.exit(exitCode);
	}
}

3. WordCount.java

가장 기본적인 하둡 예제다. 내가 짠 코드는 아니고, 예시를 위해서 들고 왔다.

영어 기준으로 만들어졌기 때문에 공백(스페이스, 탭, 줄바꿈 등)을 기준으로 단어를 잘라서 카운팅한다.

 package org.apache.hadoop.examples;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
 
import java.io.IOException;
import java.util.StringTokenizer;
 
public class WordCount {
 
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: wordcount <in> <out>");
			System.exit(2);
		}
 
		FileSystem hdfs = FileSystem.get(conf);
		Path output = new Path(otherArgs[1]);
		if (hdfs.exists(output))
			hdfs.delete(output, true);
 
		Job job = new Job(conf, "word count");
		job.setJarByClass(WordCount.class);
		job.setMapperClass(TokenizerMapper.class);
		job.setCombinerClass(IntSumReducer.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
 
	public static class TokenizerMapper
			extends Mapper<Object, Text, Text, IntWritable> {
 
		private final static IntWritable one = new IntWritable(1);
		private Text word = new Text();
 
		public void map(Object key, Text value, Context context
		) throws IOException, InterruptedException {
			StringTokenizer itr = new StringTokenizer(value.toString());
			while (itr.hasMoreTokens()) {
				word.set(itr.nextToken());
				context.write(word, one);
			}
		}
	}
 
	public static class IntSumReducer
			extends Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable result = new IntWritable();
 
		public void reduce(Text key, Iterable<IntWritable> values,
						   Context context
		) throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable val : values) {
				sum += val.get();
			}
			result.set(sum);
			context.write(key, result);
		}
	}
}

4. 실행

jar 파일을 하둡 클러스터 서버로 옮긴 뒤 jar 파일이 있는 위치에서 아래 명령어를 수행했다. 물론 명령어를 치기 전에 hdfs 환경에 input data가 있어야 한다.

hadoop jar jar파일명.jar wordcount(Driver에 선언한이름) input데이터위치 output폴더명

출처 : https://learn.microsoft.com/ko-kr/azure/hdinsight/hadoop/apache-hadoop-develop-deploy-java-mapreduce-linux

'기록 > 그 외 프로젝트 기록' 카테고리의 다른 글

[NLP] Open Korean Text 자바로 구현 (0)	2022.10.10
[Hadoop] Map Reduce - timed out after 600secs (0)	2022.09.28
[Hadoop] Python으로 wordcount하기(Hadoop Streaming) (0)	2022.09.22

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

[Hadoop] Map Reduce를 위한 maven project 만들기

1. pom.xml

2. Driver.java

3. wordcount.java

4. 실행

'기록 > 그 외 프로젝트 기록' 카테고리의 다른 글

댓글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

	<?xml version="1.0" encoding="UTF-8"?>
	<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>org.apache.hadoop.examples</groupId>
	<artifactId>wayg</artifactId>
	<packaging>jar</packaging> <!-- the build output is a jar, which is what `hadoop jar` consumes -->
	<version>1.0</version>
	<name>wayg</name>
	<url>http://maven.apache.org</url>
	<dependencies> <!-- Hadoop artifacts below are "provided": compiled against, but expected to come from the cluster at run time -->
	<dependency>
	<groupId>junit</groupId>
	<artifactId>junit</artifactId>
	<version>3.8.1</version>
	<scope>test</scope>
	</dependency>
	<dependency>
	<groupId>org.apache.hadoop</groupId>
	<artifactId>hadoop-mapreduce-examples</artifactId>
	<version>2.7.3</version>
	<scope>provided</scope>
	</dependency>
	<dependency>
	<groupId>org.apache.hadoop</groupId>
	<artifactId>hadoop-mapreduce-client-common</artifactId>
	<version>2.7.3</version>
	<scope>provided</scope>
	</dependency>
	<dependency>
	<groupId>org.apache.hadoop</groupId>
	<artifactId>hadoop-common</artifactId>
	<version>2.7.3</version>
	<scope>provided</scope>
	</dependency>
	<dependency> <!-- no scope is given, so Maven's default (compile) scope applies to this library -->
	<groupId>org.openkoreantext</groupId>
	<artifactId>open-korean-text</artifactId>
	<version>2.3.1</version>
	</dependency>

	</dependencies>
	<build>
	<plugins>
	<plugin> <!-- shade plugin: its "shade" goal is bound to the package phase in the execution below -->
	<groupId>org.apache.maven.plugins</groupId>
	<artifactId>maven-shade-plugin</artifactId>
	<version>2.3</version>
	<configuration>

	<transformers>
	<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer">
	</transformer>
	</transformers>
	</configuration>
	<executions>
	<execution>
	<phase>package</phase>
	<goals>
	<goal>shade</goal>
	</goals>
	</execution>
	</executions>
	</plugin>
	<plugin> <!-- compiler plugin: source and target levels are both set to Java 1.8 -->
	<groupId>org.apache.maven.plugins</groupId>
	<artifactId>maven-compiler-plugin</artifactId>
	<version>3.6.1</version>
	<configuration>
	<source>1.8</source>
	<target>1.8</target>
	</configuration>
	</plugin>
	<plugin> <!-- jar plugin: controls the manifest written into the jar -->
	<groupId>org.apache.maven.plugins</groupId>
	<artifactId>maven-jar-plugin</artifactId>
	<version>3.0.2</version>
	<configuration>
	<archive>
	<manifest>
	<mainClass>org.apache.hadoop.examples.Driver</mainClass> <!-- manifest entry point; must be the fully qualified name of the Driver class -->
	<addClasspath>true</addClasspath> <!-- also write a Class-Path entry into the manifest -->
	</manifest>
	</archive>
	</configuration>
	</plugin>
	</plugins>
	</build>
	<properties> <!-- read sources and write reports as UTF-8 -->
	<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
	</properties>
	</project>

	package org.apache.hadoop.examples;

	import org.apache.hadoop.util.ProgramDriver;

	/**
	 * Command-line entry point of the jar.
	 *
	 * <p>Registers each MapReduce program under a short command name and lets
	 * Hadoop's {@link ProgramDriver} pick the one named by the first argument,
	 * e.g. {@code hadoop jar app.jar wordcount <in> <out>}.
	 */
	public class Driver {
		/**
		 * Dispatches to the program named by {@code args[0]} and exits the JVM.
		 *
		 * @param args the command name followed by the arguments forwarded to the
		 *             selected program's {@code main}
		 */
		public static void main(String[] args) {
			// Stays -1 when dispatch throws before run() returns a result.
			int exitCode = -1;
			ProgramDriver pgd = new ProgramDriver();
			try {
				pgd.addClass("wordcount", WordCount.class, "A map/reduce program that performs word counting.");

				// Dispatch exactly once. ProgramDriver.driver() is only the Hadoop 1.x
				// compatible wrapper around run(), so calling both would invoke the
				// selected program twice.
				exitCode = pgd.run(args);
			} catch (Throwable e) {
				e.printStackTrace();
			}

			System.exit(exitCode);
		}
	}

	package org.apache.hadoop.examples;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.io.IntWritable;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapreduce.Job;
	import org.apache.hadoop.mapreduce.Mapper;
	import org.apache.hadoop.mapreduce.Reducer;
	import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
	import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
	import org.apache.hadoop.util.GenericOptionsParser;

	import java.io.IOException;
	import java.util.StringTokenizer;

	public class WordCount {

	public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	if (otherArgs.length != 2) {
	System.err.println("Usage: wordcount <in> <out>");
	System.exit(2);
	}

	FileSystem hdfs = FileSystem.get(conf);
	Path output = new Path(otherArgs[1]);
	if (hdfs.exists(output))
	hdfs.delete(output, true);

	Job job = new Job(conf, "word count");
	job.setJarByClass(WordCount.class);
	job.setMapperClass(TokenizerMapper.class);
	job.setCombinerClass(IntSumReducer.class);
	job.setReducerClass(IntSumReducer.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	public static class TokenizerMapper
	extends Mapper<Object, Text, Text, IntWritable> {

	private final static IntWritable one = new IntWritable(1);
	private Text word = new Text();

	public void map(Object key, Text value, Context context
	) throws IOException, InterruptedException {
	StringTokenizer itr = new StringTokenizer(value.toString());
	while (itr.hasMoreTokens()) {
	word.set(itr.nextToken());
	context.write(word, one);
	}
	}
	}

	public static class IntSumReducer
	extends Reducer<Text, IntWritable, Text, IntWritable> {
	private IntWritable result = new IntWritable();

	public void reduce(Text key, Iterable<IntWritable> values,
	Context context
	) throws IOException, InterruptedException {
	int sum = 0;
	for (IntWritable val : values) {
	sum += val.get();
	}
	result.set(sum);
	context.write(key, result);
	}
	}
	}

[Hadoop] Map Reduce를 위한 maven project 만들기

1. pom.xml

2. Driver.java

3. wordcount.java

4. 실행

'기록 > 그 외 프로젝트 기록' 카테고리의 다른 글

댓글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

1. pom.xml

2. Driver.java

3. wordcount.java

4. 실행